source: main/trunk/greenstone2/perllib/cpan/Mojo/Content.pm@ 32205

Last change on this file since 32205 was 32205, checked in by ak19, 6 years ago

First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses using xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes since they work. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 2. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 3. All the perl code changes to do with using xpdf tools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryPFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftool output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.

File size: 14.7 KB
Line 
1package Mojo::Content;
2use Mojo::Base 'Mojo::EventEmitter';
3
4use Carp 'croak';
5use Compress::Raw::Zlib qw(WANT_GZIP Z_STREAM_END);
6use Mojo::Headers;
7use Scalar::Util 'looks_like_number';
8
9has [qw(auto_decompress auto_relax expect_close relaxed skip_body)];
10has headers => sub { Mojo::Headers->new };
11has max_buffer_size => sub { $ENV{MOJO_MAX_BUFFER_SIZE} || 262144 };
12has max_leftover_size => sub { $ENV{MOJO_MAX_LEFTOVER_SIZE} || 262144 };
13
14my $BOUNDARY_RE
15 = qr!multipart.*boundary\s*=\s*(?:"([^"]+)"|([\w'(),.:?\-+/]+))!i;
16
17sub body_contains {
18 croak 'Method "body_contains" not implemented by subclass';
19}
20
21sub body_size { croak 'Method "body_size" not implemented by subclass' }
22
23sub boundary {
24 (shift->headers->content_type // '') =~ $BOUNDARY_RE ? $1 // $2 : undef;
25}
26
27sub charset {
28 my $type = shift->headers->content_type // '';
29 return $type =~ /charset\s*=\s*"?([^"\s;]+)"?/i ? $1 : undef;
30}
31
32sub clone {
33 my $self = shift;
34 return undef if $self->is_dynamic;
35 return $self->new(headers => $self->headers->clone);
36}
37
38sub generate_body_chunk {
39 my ($self, $offset) = @_;
40
41 $self->emit(drain => $offset) unless length($self->{body_buffer} //= '');
42 my $len = $self->headers->content_length;
43 return '' if looks_like_number $len && $len == $offset;
44 my $chunk = delete $self->{body_buffer};
45 return $self->{eof} ? '' : undef unless length $chunk;
46
47 return $chunk;
48}
49
50sub get_body_chunk {
51 croak 'Method "get_body_chunk" not implemented by subclass';
52}
53
54sub get_header_chunk { substr shift->_headers->{header_buffer}, shift, 131072 }
55
56sub header_size { length shift->_headers->{header_buffer} }
57
58sub headers_contain { index(shift->_headers->{header_buffer}, shift) >= 0 }
59
60sub is_chunked { !!shift->headers->transfer_encoding }
61
62sub is_compressed { lc(shift->headers->content_encoding // '') eq 'gzip' }
63
64sub is_dynamic { !!$_[0]{dynamic} }
65
66sub is_finished { (shift->{state} // '') eq 'finished' }
67
68sub is_limit_exceeded { !!shift->{limit} }
69
70sub is_multipart {undef}
71
72sub is_parsing_body { (shift->{state} // '') eq 'body' }
73
74sub leftovers { shift->{buffer} }
75
76sub parse {
77 my $self = shift;
78
79 # Headers
80 $self->_parse_until_body(@_);
81 return $self if $self->{state} eq 'headers';
82
83 # Chunked content
84 $self->{real_size} //= 0;
85 if ($self->is_chunked && $self->{state} ne 'headers') {
86 $self->_parse_chunked;
87 $self->{state} = 'finished' if ($self->{chunk_state} // '') eq 'finished';
88 }
89
90 # Not chunked, pass through to second buffer
91 else {
92 $self->{real_size} += length $self->{pre_buffer};
93 my $limit = $self->is_finished
94 && length($self->{buffer}) > $self->max_leftover_size;
95 $self->{buffer} .= $self->{pre_buffer} unless $limit;
96 $self->{pre_buffer} = '';
97 }
98
99 # No content
100 if ($self->skip_body) {
101 $self->{state} = 'finished';
102 return $self;
103 }
104
105 # Relaxed parsing
106 my $headers = $self->headers;
107 my $len = $headers->content_length // '';
108 if ($self->auto_relax && !length $len) {
109 my $connection = lc($headers->connection // '');
110 $self->relaxed(1)
111 if $connection eq 'close' || (!$connection && $self->expect_close);
112 }
113
114 # Chunked or relaxed content
115 if ($self->is_chunked || $self->relaxed) {
116 $self->_decompress($self->{buffer} //= '');
117 $self->{size} += length $self->{buffer};
118 $self->{buffer} = '';
119 return $self;
120 }
121
122 # Normal content
123 $len = 0 unless looks_like_number $len;
124 if ((my $need = $len - ($self->{size} ||= 0)) > 0) {
125 my $len = length $self->{buffer};
126 my $chunk = substr $self->{buffer}, 0, $need > $len ? $len : $need, '';
127 $self->_decompress($chunk);
128 $self->{size} += length $chunk;
129 }
130 $self->{state} = 'finished' if $len <= $self->progress;
131
132 return $self;
133}
134
135sub parse_body {
136 my $self = shift;
137 $self->{state} = 'body';
138 return $self->parse(@_);
139}
140
141sub progress {
142 my $self = shift;
143 return 0 unless my $state = $self->{state};
144 return 0 unless $state eq 'body' || $state eq 'finished';
145 return $self->{raw_size} - ($self->{header_size} || 0);
146}
147
148sub write {
149 my ($self, $chunk, $cb) = @_;
150
151 $self->{dynamic} = 1;
152 $self->{body_buffer} .= $chunk if defined $chunk;
153 $self->once(drain => $cb) if $cb;
154 $self->{eof} = 1 if defined $chunk && !length $chunk;
155
156 return $self;
157}
158
159sub write_chunk {
160 my ($self, $chunk, $cb) = @_;
161 $self->headers->transfer_encoding('chunked') unless $self->is_chunked;
162 $self->write(defined $chunk ? $self->_build_chunk($chunk) : $chunk, $cb);
163 $self->{eof} = 1 if defined $chunk && !length $chunk;
164 return $self;
165}
166
167sub _build_chunk {
168 my ($self, $chunk) = @_;
169
170 # End
171 return "\x0d\x0a0\x0d\x0a\x0d\x0a" unless length $chunk;
172
173 # First chunk has no leading CRLF
174 my $crlf = $self->{chunks}++ ? "\x0d\x0a" : '';
175 return $crlf . sprintf('%x', length $chunk) . "\x0d\x0a$chunk";
176}
177
178sub _decompress {
179 my ($self, $chunk) = @_;
180
181 # No compression
182 return $self->emit(read => $chunk)
183 unless $self->auto_decompress && $self->is_compressed;
184
185 # Decompress
186 $self->{post_buffer} .= $chunk;
187 my $gz = $self->{gz}
188 //= Compress::Raw::Zlib::Inflate->new(WindowBits => WANT_GZIP);
189 my $status = $gz->inflate(\$self->{post_buffer}, my $out);
190 $self->emit(read => $out) if defined $out;
191
192 # Replace Content-Encoding with Content-Length
193 $self->headers->content_length($gz->total_out)->remove('Content-Encoding')
194 if $status == Z_STREAM_END;
195
196 # Check buffer size
197 @$self{qw(state limit)} = ('finished', 1)
198 if length($self->{post_buffer} // '') > $self->max_buffer_size;
199}
200
201sub _headers {
202 my $self = shift;
203 return $self if defined $self->{header_buffer};
204 my $headers = $self->headers->to_string;
205 $self->{header_buffer} = $headers ? "$headers\x0d\x0a\x0d\x0a" : "\x0d\x0a";
206 return $self;
207}
208
209sub _parse_chunked {
210 my $self = shift;
211
212 # Trailing headers
213 return $self->_parse_chunked_trailing_headers
214 if ($self->{chunk_state} // '') eq 'trailing_headers';
215
216 while (my $len = length $self->{pre_buffer}) {
217
218 # Start new chunk (ignore the chunk extension)
219 unless ($self->{chunk_len}) {
220 last
221 unless $self->{pre_buffer} =~ s/^(?:\x0d?\x0a)?([0-9a-fA-F]+).*\x0a//;
222 next if $self->{chunk_len} = hex $1;
223
224 # Last chunk
225 $self->{chunk_state} = 'trailing_headers';
226 last;
227 }
228
229 # Remove as much as possible from payload
230 $len = $self->{chunk_len} if $self->{chunk_len} < $len;
231 $self->{buffer} .= substr $self->{pre_buffer}, 0, $len, '';
232 $self->{real_size} += $len;
233 $self->{chunk_len} -= $len;
234 }
235
236 # Trailing headers
237 $self->_parse_chunked_trailing_headers
238 if ($self->{chunk_state} // '') eq 'trailing_headers';
239
240 # Check buffer size
241 @$self{qw(state limit)} = ('finished', 1)
242 if length($self->{pre_buffer} // '') > $self->max_buffer_size;
243}
244
245sub _parse_chunked_trailing_headers {
246 my $self = shift;
247
248 my $headers = $self->headers->parse(delete $self->{pre_buffer});
249 return unless $headers->is_finished;
250 $self->{chunk_state} = 'finished';
251
252 # Take care of leftover and replace Transfer-Encoding with Content-Length
253 $self->{buffer} .= $headers->leftovers;
254 $headers->remove('Transfer-Encoding');
255 $headers->content_length($self->{real_size}) unless $headers->content_length;
256}
257
258sub _parse_headers {
259 my $self = shift;
260
261 my $headers = $self->headers->parse(delete $self->{pre_buffer});
262 return unless $headers->is_finished;
263 $self->{state} = 'body';
264
265 # Take care of leftovers
266 my $leftovers = $self->{pre_buffer} = $headers->leftovers;
267 $self->{header_size} = $self->{raw_size} - length $leftovers;
268}
269
270sub _parse_until_body {
271 my ($self, $chunk) = @_;
272
273 $self->{raw_size} += length($chunk //= '');
274 $self->{pre_buffer} .= $chunk;
275 $self->_parse_headers if ($self->{state} ||= 'headers') eq 'headers';
276 $self->emit('body') if $self->{state} ne 'headers' && !$self->{body}++;
277}
278
2791;
280
281=encoding utf8
282
283=head1 NAME
284
285Mojo::Content - HTTP content base class
286
287=head1 SYNOPSIS
288
289 package Mojo::Content::MyContent;
290 use Mojo::Base 'Mojo::Content';
291
292 sub body_contains {...}
293 sub body_size {...}
294 sub get_body_chunk {...}
295
296=head1 DESCRIPTION
297
298L<Mojo::Content> is an abstract base class for HTTP content containers, based on
299L<RFC 7230|http://tools.ietf.org/html/rfc7230> and
300L<RFC 7231|http://tools.ietf.org/html/rfc7231>, like
301L<Mojo::Content::MultiPart> and L<Mojo::Content::Single>.
302
303=head1 EVENTS
304
305L<Mojo::Content> inherits all events from L<Mojo::EventEmitter> and can emit
306the following new ones.
307
308=head2 body
309
310 $content->on(body => sub {
311 my $content = shift;
312 ...
313 });
314
315Emitted once all headers have been parsed and the body starts.
316
317 $content->on(body => sub {
318 my $content = shift;
319 $content->auto_upgrade(0) if $content->headers->header('X-No-MultiPart');
320 });
321
322=head2 drain
323
324 $content->on(drain => sub {
325 my ($content, $offset) = @_;
326 ...
327 });
328
329Emitted once all data has been written.
330
331 $content->on(drain => sub {
332 my $content = shift;
333 $content->write_chunk(time);
334 });
335
336=head2 read
337
338 $content->on(read => sub {
339 my ($content, $bytes) = @_;
340 ...
341 });
342
343Emitted when a new chunk of content arrives.
344
345 $content->on(read => sub {
346 my ($content, $bytes) = @_;
347 say "Streaming: $bytes";
348 });
349
350=head1 ATTRIBUTES
351
352L<Mojo::Content> implements the following attributes.
353
354=head2 auto_decompress
355
356 my $bool = $content->auto_decompress;
357 $content = $content->auto_decompress($bool);
358
359Decompress content automatically if L</"is_compressed"> is true.
360
361=head2 auto_relax
362
363 my $bool = $content->auto_relax;
364 $content = $content->auto_relax($bool);
365
366Try to detect when relaxed parsing is necessary.
367
368=head2 expect_close
369
370 my $bool = $content->expect_close;
371 $content = $content->expect_close($bool);
372
373Expect a response that is terminated with a connection close.
374
375=head2 headers
376
377 my $headers = $content->headers;
378 $content = $content->headers(Mojo::Headers->new);
379
380Content headers, defaults to a L<Mojo::Headers> object.
381
382=head2 max_buffer_size
383
384 my $size = $content->max_buffer_size;
385 $content = $content->max_buffer_size(1024);
386
387Maximum size in bytes of buffer for content parser, defaults to the value of
388the C<MOJO_MAX_BUFFER_SIZE> environment variable or C<262144> (256KiB).
389
390=head2 max_leftover_size
391
392 my $size = $content->max_leftover_size;
393 $content = $content->max_leftover_size(1024);
394
395Maximum size in bytes of buffer for pipelined HTTP requests, defaults to the
396value of the C<MOJO_MAX_LEFTOVER_SIZE> environment variable or C<262144>
397(256KiB).
398
399=head2 relaxed
400
401 my $bool = $content->relaxed;
402 $content = $content->relaxed($bool);
403
404Activate relaxed parsing for responses that are terminated with a connection
405close.
406
407=head2 skip_body
408
409 my $bool = $content->skip_body;
410 $content = $content->skip_body($bool);
411
412Skip body parsing and finish after headers.
413
414=head1 METHODS
415
416L<Mojo::Content> inherits all methods from L<Mojo::EventEmitter> and implements
417the following new ones.
418
419=head2 body_contains
420
421 my $bool = $content->body_contains('foo bar baz');
422
423Check if content contains a specific string. Meant to be overloaded in a
424subclass.
425
426=head2 body_size
427
428 my $size = $content->body_size;
429
430Content size in bytes. Meant to be overloaded in a subclass.
431
432=head2 boundary
433
434 my $boundary = $content->boundary;
435
436Extract multipart boundary from C<Content-Type> header.
437
438=head2 charset
439
440 my $charset = $content->charset;
441
442Extract charset from C<Content-Type> header.
443
444=head2 clone
445
446 my $clone = $content->clone;
447
448Return a new L<Mojo::Content> object cloned from this content if possible,
449otherwise return C<undef>.
450
451=head2 generate_body_chunk
452
453 my $bytes = $content->generate_body_chunk(0);
454
455Generate dynamic content.
456
457=head2 get_body_chunk
458
459 my $bytes = $content->get_body_chunk(0);
460
461Get a chunk of content starting from a specific position. Meant to be
462overloaded in a subclass.
463
464=head2 get_header_chunk
465
466 my $bytes = $content->get_header_chunk(13);
467
468Get a chunk of the headers starting from a specific position. Note that this
469method finalizes the content.
470
471=head2 header_size
472
473 my $size = $content->header_size;
474
475Size of headers in bytes. Note that this method finalizes the content.
476
477=head2 headers_contain
478
479 my $bool = $content->headers_contain('foo bar baz');
480
481Check if headers contain a specific string. Note that this method finalizes the
482content.
483
484=head2 is_chunked
485
486 my $bool = $content->is_chunked;
487
488Check if C<Transfer-Encoding> header indicates chunked transfer encoding.
489
490=head2 is_compressed
491
492 my $bool = $content->is_compressed;
493
494Check C<Content-Encoding> header for C<gzip> value.
495
496=head2 is_dynamic
497
498 my $bool = $content->is_dynamic;
499
500Check if content will be dynamically generated, which prevents L</"clone"> from
501working.
502
503=head2 is_finished
504
505 my $bool = $content->is_finished;
506
507Check if parser is finished.
508
509=head2 is_limit_exceeded
510
511 my $bool = $content->is_limit_exceeded;
512
513Check if buffer has exceeded L</"max_buffer_size">.
514
515=head2 is_multipart
516
517 my $bool = $content->is_multipart;
518
519False, this is not a L<Mojo::Content::MultiPart> object.
520
521=head2 is_parsing_body
522
523 my $bool = $content->is_parsing_body;
524
525Check if body parsing started yet.
526
527=head2 leftovers
528
529 my $bytes = $content->leftovers;
530
531Get leftover data from content parser.
532
533=head2 parse
534
535 $content
536 = $content->parse("Content-Length: 12\x0d\x0a\x0d\x0aHello World!");
537
538Parse content chunk.
539
540=head2 parse_body
541
542 $content = $content->parse_body('Hi!');
543
544Parse body chunk and skip headers.
545
546=head2 progress
547
548 my $size = $content->progress;
549
550Size of content already received from message in bytes.
551
552=head2 write
553
554 $content = $content->write;
555 $content = $content->write('');
556 $content = $content->write($bytes);
557 $content = $content->write($bytes => sub {...});
558
559Write dynamic content non-blocking, the optional drain callback will be executed
560once all data has been written. Calling this method without a chunk of data
561will finalize the L</"headers"> and allow for dynamic content to be written
562later. You can write an empty chunk of data at any time to end the stream.
563
564 # Make sure previous chunk of data has been written before continuing
565 $content->write('He' => sub {
566 my $content = shift;
567 $content->write('llo!' => sub {
568 my $content = shift;
569 $content->write('');
570 });
571 });
572
573=head2 write_chunk
574
575 $content = $content->write_chunk;
576 $content = $content->write_chunk('');
577 $content = $content->write_chunk($bytes);
578 $content = $content->write_chunk($bytes => sub {...});
579
580Write dynamic content non-blocking with chunked transfer encoding, the optional
581drain callback will be executed once all data has been written. Calling this
582method without a chunk of data will finalize the L</"headers"> and allow for
583dynamic content to be written later. You can write an empty chunk of data at any
584time to end the stream.
585
586 # Make sure previous chunk of data has been written before continuing
587 $content->write_chunk('He' => sub {
588 my $content = shift;
589 $content->write_chunk('llo!' => sub {
590 my $content = shift;
591 $content->write_chunk('');
592 });
593 });
594
595=head1 SEE ALSO
596
597L<Mojolicious>, L<Mojolicious::Guides>, L<https://mojolicious.org>.
598
599=cut
Note: See TracBrowser for help on using the repository browser.