source: main/trunk/greenstone2/runtime-src/packages/d2m/htget.c@ 31853

Last change on this file since 31853 was 10365, checked in by kjdon, 19 years ago

changed my mind, now adding these all individually instead of in a tar file

  • Property svn:keywords set to Author Date Id Revision
File size: 14.2 KB
Line 
1/* ------------------------------------------------------------------- */
2/* htget : Fetch a file using HTTP protocol */
3/* */
4/* Author : Ole Husby, BIBSYS */
5/* Updated : 1998-09-30 */
6/* */
7/* ------------------------------------------------------------------- */
8/* */
9/* htget(url, type, timeout_seconds, outfile, content_type, location) */
10/* */
11/* Returns: HTTP statuscode, with additional private: */
12/* 0 : OK ( = 200) */
13/* 900 : Error, possible timeout */
14/* 901 : Syntax error in url */
15/* 902 : Unknown host */
16/* 903 : No response from server (no connection) */
17/* 904 : File is not text/html */
18/* 905 : Statusline > 255 bytes */
19/* 906 : Statusline < 4 bytes */
20/* 907 : Statusline not starting with "HTTP" */
21/* 908 : Statuscode not numeric */
22/* 909 : Size of header > BUFSIZE */
23/* 910 : Unable to open output file */
24/* 999 : Unspecified TCP/IP error */
25/* */
26/* Writes to outfile, depending on type, if statuscode = 0 | 200 : */
27/* */
28/* type = 0 : Nothing */
29/* type = 1 : HTTP header */
30/* type = 2 : HTTP header + entitybody */
31/* type = 3 : HTTP entitybody */
32/* type = 4 : HTTP entitybody if text/html */
33/* type = 5 : HTTP <HEAD> part of entitybody if text/html */
34/* type = 6 : HTTP <HEAD> part of entitybody if text/html */
35/* HTTP entitybody if application/marc */
36/* */
37/* ------------------------------------------------------------------- */
38
39
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
51
/* Boolean values used for local flags */
#define FALSE   0
#define TRUE    1

/* Size of the I/O buffers; also the upper limit for the HTTP header */
#define BUFSIZE 10000

/* Values of the "type" argument of htget(): what to write to outfile */
enum
{
    TYPE_NONE     = 0,   /* nothing                                    */
    TYPE_HTTPHEAD = 1,   /* HTTP header                                */
    TYPE_HTTPALL  = 2,   /* HTTP header + entitybody                   */
    TYPE_HTTPBODY = 3,   /* HTTP entitybody                            */
    TYPE_HTMLALL  = 4,   /* HTTP entitybody if text/html               */
    TYPE_HTMLHEAD = 5,   /* <HEAD> part of entitybody if text/html     */
    TYPE_MARC     = 6    /* as 5, or entitybody if application/marc    */
};

/* Set to non-zero to print a trace of the dialogue on stdout */
#define TRACE   0

/* Product token sent to the server with every request */
#define AGENT   "BIBSYS_htget v1.1"

/* Content-Type of the last response, filled in by getHeader() */
char conType[128];

/* Set by getHeader(): only the <HEAD> part of the body is wanted */
int htmlonly;

/* Set by getHeader(): the response is an application/marc record */
int marcrecord;
69
70
71
/* Handler for SIGALRM. It deliberately does nothing: it is installed   */
/* only so that the alarm signal is caught instead of being left to     */
/* its default action.                                                  */
void thandler(int i)
{
    (void) i;   /* the signal number is of no interest */
}
75
76
77
/* ------------------------------------------------------------------- */
/* geteoHEAD: Look for <BODY or </HEAD> in a NUL-terminated buffer     */
/*                                                                     */
/*   <BODY is looked for first. If it is found, buf is cut             */
/*   immediately before it. Otherwise, if </HEAD> is found, buf is     */
/*   cut immediately after it. In both cases a '\n' is placed at the   */
/*   cut, except when </HEAD> is the very last thing in buf: then      */
/*   the string already ends where it should and is left untouched,    */
/*   so that nothing is ever written beyond the original terminator.   */
/*                                                                     */
/*   returns:    0 if neither marker was found (buf unchanged)         */
/*               1 if a marker was found and buf was cut               */
/*                                                                     */
/*   NOTE(review): cstr() is defined outside this file; from its use   */
/*   here it is presumably a (case-insensitive?) substring search      */
/*   returning a pointer into buf or NULL - confirm. It is called      */
/*   without a visible prototype; one should be supplied by a header.  */
/* ------------------------------------------------------------------- */

int geteoHEAD(char *buf)
{
    char *p;

    p = (char *) cstr(buf, "<BODY");
    if (p)
    {
        /* Both bytes overwritten here belong to the marker itself */
        p[0] = '\n';
        p[1] = '\0';
        return 1;
    }

    p = (char *) cstr(buf, "</HEAD>");
    if (p)
    {
        p += 7;                 /* step over "</HEAD>" */

        /* Only append the newline when there is a character to     */
        /* overwrite; at end of string p[1] would be out of bounds. */
        if (*p != '\0')
        {
            p[0] = '\n';
            p[1] = '\0';
        }
        return 1;
    }

    return 0;
}
113
114
115
116/* ------------------------------------------------------------------- */
117/* writeRequest: send request to server */
118/* */
119/* returns: Number of bytes written */
120/* ------------------------------------------------------------------- */
121
122int writeRequest(char *req, int server)
123{
124 if (TRACE)
125 printf("*** send(): %s\n", req);
126
127 return write(server, req, strlen(req));
128}
129
130
131
132/* ------------------------------------------------------------------- */
133/* getBody : Read the Entity-body into the file filename */
134/* */
135/* Open the given file for writing, read data from the */
136/* socket until a terminating '\0' is found, write to */
137/* the file. Returns 0 if ok, positive if an error results */
138/* in errno being set, or -1 if other error. */
139
140/* Single read()'s blocking for more than TIMEOUT_SECONDS will */
141/* be interrupted. The read() then returns a negative value, and */
142/* errno will be set appropriately (EINTR). */
143/* */
144/* Returns 0 if ok */
145/* 900 if read error */
146/* */
147/* ------------------------------------------------------------------- */
148
149int getBody(int server, int timeout, int fd, char *filename)
150{
151 int i, ef;
152 unsigned char *bf, buf[BUFSIZE + 1];
153 int bytecount;
154 int found_end;
155
156 if (TRACE)
157 printf("*** Read entitybody\n");
158
159/* Loop until the endmark is found */
160
161 found_end = FALSE;
162
163
164 while (!found_end)
165 {
166 alarm (timeout);
167 bytecount = read (server, buf, BUFSIZE);
168 alarm (0);
169
170 if (bytecount < 0)
171 return 900; /* error in read() */
172
173 else if (bytecount == 0)
174 break; /* server closed socket */
175
176
177 else
178 {
179 if (buf[bytecount-1] == '\0')
180 {
181 bytecount--; /* do not write the '\0' to file */
182 found_end = TRUE; /* terminate the loop */
183 }
184
185/* Write to file */
186
187 if (bytecount > 0)
188 {
189 bf = (unsigned char *) buf;
190 bf[bytecount] = '\0';
191 if ( htmlonly )
192 {
193 ef = geteoHEAD(bf);
194 bytecount = strlen(bf);
195 }
196 else
197 ef = 0;
198 write (fd, bf, bytecount);
199 if (ef)
200 break;
201 }
202 }
203 }
204
205 return 0;
206}
207
208
209
210/* ------------------------------------------------------------------- */
211/* getHeader : Access file and read HTTP header */
212/* */
213/* Returns 0 if ok */
214/* 904 if Content-type not "text/html" */
215/* 905 if Statusline > 255 bytes */
216/* 906 if Statusline < 4 bytes */
217/* 907 if Statusline not starting with "HTTP" */
218/* 908 if Statuscode not numeric */
219/* 909 if size of header > BUFSIZE */
220/* HTTP statuscode if <> 200 */
221/* */
222/* ------------------------------------------------------------------- */
223
224int getHeader(int server, int timeout, int type, int fd, char *reason, char *loc)
225{
226 char buf[BUFSIZE+1], dummy[8];
227 unsigned char *p, *cp = buf, *d = dummy;
228 int i, rc, statuscode, plf, rlen = 0;
229
230 reason = (char *) NULL;
231 *conType = 0;
232
233 if (TRACE)
234 printf("*** Read statusline\n");
235
236
237/* Read HTTP statusline (until LF or 0, max 256 byte) */
238
239 for ( cp = buf, *buf = 0; 1; cp++ )
240 {
241 alarm (timeout);
242 rc = read ( server, cp, 1 );
243 alarm (0);
244
245 if ( rc < 0 )
246 return 900;
247
248 if ( *cp == '\r')
249 {
250 cp--;
251 continue;
252 }
253
254 rlen++;
255
256 if (TRACE)
257 printf("%c", *cp);
258
259 if ( *cp == 0 || rlen > 255)
260 return 905;
261 if ( *cp == '\n')
262 {
263 *cp = 0;
264 break;
265 }
266 }
267
268/* Write statusline */
269
270 if ( ( type == TYPE_HTTPHEAD ) || ( type == TYPE_HTTPALL ) )
271 {
272 write(fd, buf, strlen(buf));
273 write(fd, "\n\n", 2);
274 }
275
276 if (TRACE)
277 printf("*** read() : (%d) %s\n", rlen, buf);
278
279
280/* Parse statusline */
281
282 if (rlen < 4)
283 return 906;
284
285 if (strncmp(buf, "HTTP", 4) != 0)
286 return 907;
287
288 p = strtok(buf, " ");
289 p = strtok(NULL, " ");
290
291 if (!*p)
292 statuscode = 200;
293 else
294 {
295 for (i = 0; i < strlen(p); i++)
296 if (!isdigit(p[i]))
297 return 908;
298 }
299 statuscode = atoi(p);
300
301 p = strtok(NULL, "\0");
302 if (p)
303 reason = p;
304
305 if (statuscode == 200)
306 statuscode = 0;
307
308 if (!type)
309 return statuscode;
310
311
312/* Read HTTP response header (until 0 or empty line, max BUFSIZE bytes */
313
314 rlen = 0;
315 plf = FALSE;
316
317 if (TRACE)
318 printf("*** Read responseheader\n");
319
320 for ( cp = buf, *buf = 0; 1; cp++ )
321 {
322 alarm (timeout);
323 rc = read ( server, cp, 1 );
324 alarm (0);
325
326 if ( rc < 0 )
327 return 900;
328
329 if ( *cp == '\r')
330 {
331 cp--;
332 continue;
333 }
334
335 rlen++;
336
337 if ( *cp == 0 || rlen > BUFSIZE)
338 return 909;
339 else if ( *cp == '\n')
340 {
341 if (plf)
342 {
343 *cp = 0;
344 break;
345 }
346 else
347 plf = TRUE;
348 }
349 else
350 plf = FALSE;
351 }
352
353/* Write rest of HTTP header */
354
355 if ( ( type == TYPE_HTTPHEAD ) || ( type == TYPE_HTTPALL ) )
356 {
357 write(fd, buf, strlen(buf));
358 write(fd, "\n", 1);
359 }
360
361 if (TRACE)
362 printf("*** read() : (%d) %s\n", rlen, buf);
363
364
365/* Parse header for Content-Type and Loaction */
366
367 rc = 904;
368
369 p = strtok(buf, "\n");
370 while (p)
371 {
372 if (strncasecmp(p, "Content-Type:", 13) == 0)
373 {
374 p += 13;
375 while (p[0] == ' ')
376 p++;
377 strcpy(conType, p);
378
379 if (strncasecmp(p, "text/html", 7) == 0)
380 {
381 if ( ( type == TYPE_HTMLHEAD) || (type == TYPE_MARC ) )
382 htmlonly = 1;
383 rc = 0;
384 }
385
386 else if (strncasecmp(p, "application/marc", 16) == 0)
387 {
388 if ( type == TYPE_MARC )
389 {
390 marcrecord = 1;
391 rc = 0;
392 }
393 }
394 }
395
396 else if (strncasecmp(p, "Location:", 9) == 0)
397 {
398 p += 9;
399 while (p[0] == ' ')
400 p++;
401 strcpy(loc, p);
402 }
403 p = strtok(NULL, "\n");
404 }
405
406
407/* All OK. Socket is positioned at start of HTTP Entity-Body */
408
409 if (rc)
410 return rc;
411 else
412 return statuscode;
413}
414
415
416
417
418
419/* ------------------------------------------------------------------- */
420/* htget : Fetch a URL */
421/* ------------------------------------------------------------------- */
422
423int htget(char *iurl, int type, int timeout, char *outfile, char *h_contype, char *h_location)
424{
425 int i, rc, fd, soc, port;
426 struct sockaddr_in addr;
427 struct hostent *hp, *gethostbyname();
428 char uurl[1024], *url = uurl, hostname[256], cport[64], req[1024];
429 char *h, *p, *q, *r;
430 char tfile[256], blank[2];
431
432 *blank = *h_contype = *h_location = 0;
433 htmlonly = marcrecord = 0;
434
435 strcpy(tfile, "/tmp/geturl.tmp");
436 if (!*outfile)
437 outfile = (char *) tfile;
438
439 if (!*iurl || ( strlen(iurl) > 1023 ) )
440 return 901;
441
442 strcpy(url, iurl);
443
444/* Parse and validate URL */
445
446 if (strncmp(url, "http://", 7) != 0)
447 return 901;
448
449 url += 7;
450
451 q = strtok(url, "/");
452 if (!q)
453 return 901;
454
455 r = strtok(NULL, "\0");
456 if (!r)
457 r = (char *) blank;
458
459 h = strtok(q, ":");
460 if (!*h)
461 return 901;
462
463 strcpy(hostname, h);
464
465 p = strtok(NULL, "\0");
466
467 if (!p || !*p)
468 port = 80;
469 else
470 {
471 for (i = 0; i < strlen(p); i++)
472 if (!isdigit(p[i]))
473 return 901;
474 port = atoi(p);
475 }
476 sprintf(req, "GET /%s HTTP/1.0\r\nUser_Agent: %s\n\n", r, AGENT);
477
478
479/* Establish handler for the alarm-signal */
480
481 signal (SIGALRM, thandler);
482
483
484/* Get IP address */
485
486 hp = gethostbyname(hostname);
487 if (!hp)
488 return 902;
489
490/* Get socket and connect */
491
492 soc = socket(AF_INET, SOCK_STREAM, 0);
493
494 addr.sin_family = AF_INET;
495
496 memcpy( &addr.sin_addr.s_addr, hp->h_addr, (size_t) hp->h_length );
497
498 addr.sin_port = htons(port);
499 if (connect(soc, (struct sockaddr *) &addr, sizeof(struct sockaddr_in)) < 0)
500 return 903;
501
502
503/* Open the destination file */
504
505 if (type)
506 {
507 fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC , 0666);
508 if (fd < 0)
509 {
510 close(soc);
511 return 910;
512 }
513 }
514
515
516
517
518/* Write HTTP-request */
519
520 if (!writeRequest(req, soc))
521 {
522 close(soc);
523 if (type)
524 close(fd);
525 return 999;
526 }
527
528
529/* Read header part of response */
530
531 rc = getHeader(soc, timeout, type, fd, r, h_location);
532
533 if ( ( rc == 904 ) && (type < 4 ) )
534 rc = 0;
535
536 if ( ( rc == 0 ) && ( type > 1 ) )
537 rc = getBody(soc, timeout, fd, outfile);
538
539 close(soc);
540 if (type)
541 close(fd);
542 strcpy(h_contype, conType);
543
544 return rc;
545}
Note: See TracBrowser for help on using the repository browser.