source: main/trunk/greenstone2/perllib/cpan/Mojo/Message.pm

Last change on this file was 32205, checked in by ak19, 6 years ago

First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses using xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes since they work. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 2. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 3. All the perl code changes to do with using xpdf tools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryPFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftool output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.

File size: 16.9 KB
Line 
1package Mojo::Message;
2use Mojo::Base 'Mojo::EventEmitter';
3
4use Carp 'croak';
5use Mojo::Asset::Memory;
6use Mojo::Content::Single;
7use Mojo::DOM;
8use Mojo::JSON 'j';
9use Mojo::JSON::Pointer;
10use Mojo::Parameters;
11use Mojo::Upload;
12use Mojo::Util 'decode';
13
14has content => sub { Mojo::Content::Single->new };
15has default_charset => 'UTF-8';
16has max_line_size => sub { $ENV{MOJO_MAX_LINE_SIZE} || 8192 };
17has max_message_size => sub { $ENV{MOJO_MAX_MESSAGE_SIZE} // 16777216 };
18has version => '1.1';
19
20sub body {
21 my $self = shift;
22
23 # Get
24 my $content = $self->content;
25 return $content->is_multipart ? '' : $content->asset->slurp unless @_;
26
27 # Set (multipart content needs to be downgraded)
28 $content = $self->content(Mojo::Content::Single->new)->content
29 if $content->is_multipart;
30 $content->asset(Mojo::Asset::Memory->new->add_chunk(@_));
31
32 return $self;
33}
34
35sub body_params {
36 my $self = shift;
37
38 return $self->{body_params} if $self->{body_params};
39 my $params = $self->{body_params} = Mojo::Parameters->new;
40 $params->charset($self->content->charset || $self->default_charset);
41
42 # "application/x-www-form-urlencoded"
43 my $type = $self->headers->content_type // '';
44 if ($type =~ m!application/x-www-form-urlencoded!i) {
45 $params->parse($self->content->asset->slurp);
46 }
47
48 # "multipart/form-data"
49 elsif ($type =~ m!multipart/form-data!i) {
50 $params->append(@$_[0, 1]) for @{$self->_parse_formdata};
51 }
52
53 return $params;
54}
55
56sub body_size { shift->content->body_size }
57
58sub build_body { shift->_build('get_body_chunk') }
59sub build_headers { shift->_build('get_header_chunk') }
60sub build_start_line { shift->_build('get_start_line_chunk') }
61
62sub cookie { shift->_cache('cookies', 0, @_) }
63
64sub cookies { croak 'Method "cookies" not implemented by subclass' }
65
66sub dom {
67 my $self = shift;
68 return undef if $self->content->is_multipart;
69 my $dom = $self->{dom} ||= Mojo::DOM->new($self->text);
70 return @_ ? $dom->find(@_) : $dom;
71}
72
73sub error {
74 my $self = shift;
75 return $self->{error} unless @_;
76 $self->{error} = shift;
77 return $self->finish;
78}
79
80sub every_cookie { shift->_cache('cookies', 1, @_) }
81sub every_upload { shift->_cache('uploads', 1, @_) }
82
83sub extract_start_line {
84 croak 'Method "extract_start_line" not implemented by subclass';
85}
86
87sub finish {
88 my $self = shift;
89 $self->{state} = 'finished';
90 return $self->{finished}++ ? $self : $self->emit('finish');
91}
92
93sub fix_headers {
94 my $self = shift;
95 return $self if $self->{fix}++;
96
97 # Content-Length or Connection (unless chunked transfer encoding is used)
98 my $content = $self->content;
99 my $headers = $content->headers;
100 if ($content->is_multipart) { $headers->remove('Content-Length') }
101 elsif ($content->is_chunked || $headers->content_length) { return $self }
102 if ($content->is_dynamic) { $headers->connection('close') }
103 else { $headers->content_length($self->body_size) }
104
105 return $self;
106}
107
108sub get_body_chunk {
109 my ($self, $offset) = @_;
110
111 $self->emit('progress', 'body', $offset);
112 my $chunk = $self->content->get_body_chunk($offset);
113 return $chunk if !defined $chunk || length $chunk;
114 $self->finish;
115
116 return $chunk;
117}
118
119sub get_header_chunk {
120 my ($self, $offset) = @_;
121 $self->emit('progress', 'headers', $offset);
122 return $self->fix_headers->content->get_header_chunk($offset);
123}
124
125sub get_start_line_chunk {
126 croak 'Method "get_start_line_chunk" not implemented by subclass';
127}
128
129sub header_size { shift->fix_headers->content->header_size }
130
131sub headers { shift->content->headers }
132
133sub is_finished { (shift->{state} // '') eq 'finished' }
134
135sub is_limit_exceeded { !!shift->{limit} }
136
137sub json {
138 my ($self, $pointer) = @_;
139 return undef if $self->content->is_multipart;
140 my $data = $self->{json} //= j($self->body);
141 return $pointer ? Mojo::JSON::Pointer->new($data)->get($pointer) : $data;
142}
143
144sub parse {
145 my ($self, $chunk) = @_;
146
147 return $self if $self->{error};
148 $self->{raw_size} += length $chunk;
149 $self->{buffer} .= $chunk;
150
151 # Start-line
152 unless ($self->{state}) {
153
154 # Check start-line size
155 my $len = index $self->{buffer}, "\x0a";
156 $len = length $self->{buffer} if $len < 0;
157 return $self->_limit('Maximum start-line size exceeded')
158 if $len > $self->max_line_size;
159
160 $self->{state} = 'content' if $self->extract_start_line(\$self->{buffer});
161 }
162
163 # Content
164 my $state = $self->{state} // '';
165 $self->content($self->content->parse(delete $self->{buffer}))
166 if $state eq 'content' || $state eq 'finished';
167
168 # Check message size
169 my $max = $self->max_message_size;
170 return $self->_limit('Maximum message size exceeded')
171 if $max && $max < $self->{raw_size};
172
173 # Check header size
174 return $self->_limit('Maximum header size exceeded')
175 if $self->headers->is_limit_exceeded;
176
177 # Check buffer size
178 return $self->_limit('Maximum buffer size exceeded')
179 if $self->content->is_limit_exceeded;
180
181 return $self->emit('progress')->content->is_finished ? $self->finish : $self;
182}
183
184sub start_line_size {
185 croak 'Method "start_line_size" not implemented by subclass';
186}
187
188sub text {
189 my $self = shift;
190 my $body = $self->body;
191 my $charset = $self->content->charset || $self->default_charset;
192 return $charset ? decode($charset, $body) // $body : $body;
193}
194
195sub to_string {
196 my $self = shift;
197 return $self->build_start_line . $self->build_headers . $self->build_body;
198}
199
200sub upload { shift->_cache('uploads', 0, @_) }
201
202sub uploads {
203 my $self = shift;
204
205 my @uploads;
206 for my $data (@{$self->_parse_formdata(1)}) {
207 my $upload = Mojo::Upload->new(
208 name => $data->[0],
209 filename => $data->[2],
210 asset => $data->[1]->asset,
211 headers => $data->[1]->headers
212 );
213 push @uploads, $upload;
214 }
215
216 return \@uploads;
217}
218
219sub _build {
220 my ($self, $method) = @_;
221
222 my ($buffer, $offset) = ('', 0);
223 while (1) {
224
225 # No chunk yet, try again
226 next unless defined(my $chunk = $self->$method($offset));
227
228 # End of part
229 last unless my $len = length $chunk;
230
231 $offset += $len;
232 $buffer .= $chunk;
233 }
234
235 return $buffer;
236}
237
238sub _cache {
239 my ($self, $method, $all, $name) = @_;
240
241 # Cache objects by name
242 unless ($self->{$method}) {
243 $self->{$method} = {};
244 push @{$self->{$method}{$_->name}}, $_ for @{$self->$method};
245 }
246
247 my $objects = $self->{$method}{$name} || [];
248 return $all ? $objects : $objects->[-1];
249}
250
251sub _limit { ++$_[0]{limit} and return $_[0]->error({message => $_[1]}) }
252
253sub _parse_formdata {
254 my ($self, $upload) = @_;
255
256 my @formdata;
257 my $content = $self->content;
258 return \@formdata unless $content->is_multipart;
259 my $charset = $content->charset || $self->default_charset;
260
261 # Check all parts recursively
262 my @parts = ($content);
263 while (my $part = shift @parts) {
264
265 if ($part->is_multipart) {
266 unshift @parts, @{$part->parts};
267 next;
268 }
269
270 next unless my $disposition = $part->headers->content_disposition;
271 my ($filename) = $disposition =~ /[; ]filename="((?:\\"|[^"])*)"/;
272 next if $upload && !defined $filename || !$upload && defined $filename;
273 my ($name) = $disposition =~ /[; ]name="((?:\\"|[^;"])*)"/;
274 $part = $part->asset->slurp unless $upload;
275
276 if ($charset) {
277 $name = decode($charset, $name) // $name if $name;
278 $filename = decode($charset, $filename) // $filename if $filename;
279 $part = decode($charset, $part) // $part unless $upload;
280 }
281
282 push @formdata, [$name, $part, $filename];
283 }
284
285 return \@formdata;
286}
287
2881;
289
290=encoding utf8
291
292=head1 NAME
293
294Mojo::Message - HTTP message base class
295
296=head1 SYNOPSIS
297
298 package Mojo::Message::MyMessage;
299 use Mojo::Base 'Mojo::Message';
300
301 sub cookies {...}
302 sub extract_start_line {...}
303 sub get_start_line_chunk {...}
304 sub start_line_size {...}
305
306=head1 DESCRIPTION
307
308L<Mojo::Message> is an abstract base class for HTTP message containers, based on
309L<RFC 7230|http://tools.ietf.org/html/rfc7230>,
310L<RFC 7231|http://tools.ietf.org/html/rfc7231> and
311L<RFC 2388|http://tools.ietf.org/html/rfc2388>, like L<Mojo::Message::Request>
312and L<Mojo::Message::Response>.
313
314=head1 EVENTS
315
316L<Mojo::Message> inherits all events from L<Mojo::EventEmitter> and can emit
317the following new ones.
318
319=head2 finish
320
321 $msg->on(finish => sub {
322 my $msg = shift;
323 ...
324 });
325
326Emitted after message building or parsing is finished.
327
328 my $before = time;
329 $msg->on(finish => sub {
330 my $msg = shift;
331 $msg->headers->header('X-Parser-Time' => time - $before);
332 });
333
334=head2 progress
335
336 $msg->on(progress => sub {
337 my $msg = shift;
338 ...
339 });
340
341Emitted when message building or parsing makes progress.
342
343 # Building
344 $msg->on(progress => sub {
345 my ($msg, $state, $offset) = @_;
346 say qq{Building "$state" at offset $offset};
347 });
348
349 # Parsing
350 $msg->on(progress => sub {
351 my $msg = shift;
352 return unless my $len = $msg->headers->content_length;
353 my $size = $msg->content->progress;
354 say 'Progress: ', $size == $len ? 100 : int($size / ($len / 100)), '%';
355 });
356
357=head1 ATTRIBUTES
358
359L<Mojo::Message> implements the following attributes.
360
361=head2 content
362
363 my $msg = $msg->content;
364 $msg = $msg->content(Mojo::Content::Single->new);
365
366Message content, defaults to a L<Mojo::Content::Single> object.
367
368=head2 default_charset
369
370 my $charset = $msg->default_charset;
371 $msg = $msg->default_charset('UTF-8');
372
373Default charset used by L</"text"> and to extract data from
374C<application/x-www-form-urlencoded> or C<multipart/form-data> message body,
375defaults to C<UTF-8>.
376
377=head2 max_line_size
378
379 my $size = $msg->max_line_size;
380 $msg = $msg->max_line_size(1024);
381
382Maximum start-line size in bytes, defaults to the value of the
383C<MOJO_MAX_LINE_SIZE> environment variable or C<8192> (8KiB).
384
385=head2 max_message_size
386
387 my $size = $msg->max_message_size;
388 $msg = $msg->max_message_size(1024);
389
390Maximum message size in bytes, defaults to the value of the
391C<MOJO_MAX_MESSAGE_SIZE> environment variable or C<16777216> (16MiB). Setting
392the value to C<0> will allow messages of indefinite size.
393
394=head2 version
395
396 my $version = $msg->version;
397 $msg = $msg->version('1.1');
398
399HTTP version of message, defaults to C<1.1>.
400
401=head1 METHODS
402
403L<Mojo::Message> inherits all methods from L<Mojo::EventEmitter> and implements
404the following new ones.
405
406=head2 body
407
408 my $bytes = $msg->body;
409 $msg = $msg->body('Hello!');
410
411Slurp or replace L</"content">.
412
413=head2 body_params
414
415 my $params = $msg->body_params;
416
417C<POST> parameters extracted from C<application/x-www-form-urlencoded> or
418C<multipart/form-data> message body, usually a L<Mojo::Parameters> object. Note
419that this method caches all data, so it should not be called before the entire
420message body has been received. Parts of the message body need to be loaded
421into memory to parse C<POST> parameters, so you have to make sure it is not
422excessively large. There's a 16MiB limit for requests and a 2GiB limit for
423responses by default.
424
425 # Get POST parameter names and values
426 my $hash = $msg->body_params->to_hash;
427
428=head2 body_size
429
430 my $size = $msg->body_size;
431
432Content size in bytes.
433
434=head2 build_body
435
436 my $bytes = $msg->build_body;
437
438Render whole body with L</"get_body_chunk">.
439
440=head2 build_headers
441
442 my $bytes = $msg->build_headers;
443
444Render all headers with L</"get_header_chunk">.
445
446=head2 build_start_line
447
448 my $bytes = $msg->build_start_line;
449
450Render start-line with L</"get_start_line_chunk">.
451
452=head2 cookie
453
454 my $cookie = $msg->cookie('foo');
455
456Access message cookies, usually L<Mojo::Cookie::Request> or
457L<Mojo::Cookie::Response> objects. If there are multiple cookies sharing the
458same name, and you want to access more than just the last one, you can use
459L</"every_cookie">. Note that this method caches all data, so it should not be
460called before all headers have been received.
461
462 # Get cookie value
463 say $msg->cookie('foo')->value;
464
465=head2 cookies
466
467 my $cookies = $msg->cookies;
468
469Access message cookies. Meant to be overloaded in a subclass.
470
471=head2 dom
472
473 my $dom = $msg->dom;
474 my $collection = $msg->dom('a[href]');
475
476Retrieve message body from L</"text"> and turn it into a L<Mojo::DOM> object,
477an optional selector can be used to call the method L<Mojo::DOM/"find"> on it
478right away, which then returns a L<Mojo::Collection> object. Note that this
479method caches all data, so it should not be called before the entire message
480body has been received. The whole message body needs to be loaded into memory
481to parse it, so you have to make sure it is not excessively large. There's a
48216MiB limit for requests and a 2GiB limit for responses by default.
483
484 # Perform "find" right away
485 say $msg->dom('h1, h2, h3')->map('text')->join("\n");
486
487 # Use everything else Mojo::DOM has to offer
488 say $msg->dom->at('title')->text;
489 say $msg->dom->at('body')->children->map('tag')->uniq->join("\n");
490
491=head2 error
492
493 my $err = $msg->error;
494 $msg = $msg->error({message => 'Parser error'});
495
496Get or set message error, an C<undef> return value indicates that there is no
497error.
498
499 # Connection or parser error
500 $msg->error({message => 'Connection refused'});
501
502 # 4xx/5xx response
503 $msg->error({message => 'Internal Server Error', code => 500});
504
505=head2 every_cookie
506
507 my $cookies = $msg->every_cookie('foo');
508
509Similar to L</"cookie">, but returns all message cookies sharing the same name
510as an array reference.
511
512 # Get first cookie value
513 say $msg->every_cookie('foo')->[0]->value;
514
515=head2 every_upload
516
517 my $uploads = $msg->every_upload('foo');
518
519Similar to L</"upload">, but returns all file uploads sharing the same name as
520an array reference.
521
522 # Get content of first uploaded file
523 say $msg->every_upload('foo')->[0]->asset->slurp;
524
525=head2 extract_start_line
526
527 my $bool = $msg->extract_start_line(\$str);
528
529Extract start-line from string. Meant to be overloaded in a subclass.
530
531=head2 finish
532
533 $msg = $msg->finish;
534
535Finish message parser/generator.
536
537=head2 fix_headers
538
539 $msg = $msg->fix_headers;
540
541Make sure message has all required headers.
542
543=head2 get_body_chunk
544
545 my $bytes = $msg->get_body_chunk($offset);
546
547Get a chunk of body data starting from a specific position. Note that it might
548not be possible to get the same chunk twice if content was generated
549dynamically.
550
551=head2 get_header_chunk
552
553 my $bytes = $msg->get_header_chunk($offset);
554
555Get a chunk of header data, starting from a specific position. Note that this
556method finalizes the message.
557
558=head2 get_start_line_chunk
559
560 my $bytes = $msg->get_start_line_chunk($offset);
561
562Get a chunk of start-line data starting from a specific position. Meant to be
563overloaded in a subclass.
564
565=head2 header_size
566
567 my $size = $msg->header_size;
568
569Size of headers in bytes. Note that this method finalizes the message.
570
571=head2 headers
572
573 my $headers = $msg->headers;
574
575Message headers, usually a L<Mojo::Headers> object.
576
577 # Longer version
578 my $headers = $msg->content->headers;
579
580=head2 is_finished
581
582 my $bool = $msg->is_finished;
583
584Check if message parser/generator is finished.
585
586=head2 is_limit_exceeded
587
588 my $bool = $msg->is_limit_exceeded;
589
590Check if message has exceeded L</"max_line_size">, L</"max_message_size">,
591L<Mojo::Content/"max_buffer_size"> or L<Mojo::Headers/"max_line_size">.
592
593=head2 json
594
595 my $value = $msg->json;
596 my $value = $msg->json('/foo/bar');
597
598Decode JSON message body directly using L<Mojo::JSON> if possible, an C<undef>
599return value indicates a bare C<null> or that decoding failed. An optional JSON
600Pointer can be used to extract a specific value with L<Mojo::JSON::Pointer>.
601Note that this method caches all data, so it should not be called before the
602entire message body has been received. The whole message body needs to be
603loaded into memory to parse it, so you have to make sure it is not excessively
604large. There's a 16MiB limit for requests and a 2GiB limit for responses by
605default.
606
607 # Extract JSON values
608 say $msg->json->{foo}{bar}[23];
609 say $msg->json('/foo/bar/23');
610
611=head2 parse
612
613 $msg = $msg->parse('HTTP/1.1 200 OK...');
614
615Parse message chunk.
616
617=head2 start_line_size
618
619 my $size = $msg->start_line_size;
620
621Size of the start-line in bytes. Meant to be overloaded in a subclass.
622
623=head2 text
624
625 my $str = $msg->text;
626
627Retrieve L</"body"> and try to decode it with L<Mojo::Content/"charset"> or
628L</"default_charset">.
629
630=head2 to_string
631
632 my $str = $msg->to_string;
633
634Render whole message. Note that this method finalizes the message, and that it
635might not be possible to render the same message twice if content was generated
636dynamically.
637
638=head2 upload
639
640 my $upload = $msg->upload('foo');
641
642Access C<multipart/form-data> file uploads, usually L<Mojo::Upload> objects. If
643there are multiple uploads sharing the same name, and you want to access more
644than just the last one, you can use L</"every_upload">. Note that this method
645caches all data, so it should not be called before the entire message body has
646been received.
647
648 # Get content of uploaded file
649 say $msg->upload('foo')->asset->slurp;
650
651=head2 uploads
652
653 my $uploads = $msg->uploads;
654
655All C<multipart/form-data> file uploads, usually L<Mojo::Upload> objects.
656
657 # Names of all uploads
658 say $_->name for @{$msg->uploads};
659
660=head1 SEE ALSO
661
662L<Mojolicious>, L<Mojolicious::Guides>, L<https://mojolicious.org>.
663
664=cut
Note: See TracBrowser for help on using the repository browser.