source: main/trunk/greenstone2/perllib/cpan/Mojo/URL.pm

Last change on this file was 32205, checked in by ak19, 6 years ago

First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses using xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes since they work. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 2. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 3. All the perl code changes to do with using xpdf tools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryPFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftool output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.

File size: 12.6 KB
Line 
1package Mojo::URL;
2use Mojo::Base -base;
3use overload bool => sub {1}, '""' => sub { shift->to_string }, fallback => 1;
4
5use Mojo::Parameters;
6use Mojo::Path;
7use Mojo::Util
8 qw(decode encode punycode_decode punycode_encode url_escape url_unescape);
9
10has base => sub { Mojo::URL->new };
11has [qw(fragment host port scheme userinfo)];
12
13sub clone {
14 my $self = shift;
15 my $clone = $self->new;
16 @$clone{keys %$self} = values %$self;
17 $clone->{$_} && ($clone->{$_} = $clone->{$_}->clone) for qw(base path query);
18 return $clone;
19}
20
21sub host_port {
22 my ($self, $host_port) = @_;
23
24 if (defined $host_port) {
25 $self->port($1) if $host_port =~ s/:(\d+)$//;
26 my $host = url_unescape $host_port;
27 return $host =~ /[^\x00-\x7f]/ ? $self->ihost($host) : $self->host($host);
28 }
29
30 return undef unless defined(my $host = $self->ihost);
31 return $host unless defined(my $port = $self->port);
32 return "$host:$port";
33}
34
35sub ihost {
36 my $self = shift;
37
38 # Decode
39 return $self->host(join '.',
40 map { /^xn--(.+)$/ ? punycode_decode $1 : $_ } split(/\./, shift, -1))
41 if @_;
42
43 # Check if host needs to be encoded
44 return undef unless defined(my $host = $self->host);
45 return $host unless $host =~ /[^\x00-\x7f]/;
46
47 # Encode
48 return join '.',
49 map { /[^\x00-\x7f]/ ? ('xn--' . punycode_encode $_) : $_ }
50 split(/\./, $host, -1);
51}
52
53sub is_abs { !!shift->scheme }
54
55sub new { @_ > 1 ? shift->SUPER::new->parse(@_) : shift->SUPER::new }
56
57sub parse {
58 my ($self, $url) = @_;
59
60 # Official regex from RFC 3986
61 $url =~ m!^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?!;
62 $self->scheme($2) if defined $2;
63 $self->path($5) if defined $5;
64 $self->query($7) if defined $7;
65 $self->fragment(_decode(url_unescape $9)) if defined $9;
66 if (defined(my $auth = $4)) {
67 $self->userinfo(_decode(url_unescape $1)) if $auth =~ s/^([^\@]+)\@//;
68 $self->host_port($auth);
69 }
70
71 return $self;
72}
73
74sub password { (shift->userinfo // '') =~ /:(.*)$/ ? $1 : undef }
75
76sub path {
77 my $self = shift;
78
79 # Old path
80 $self->{path} ||= Mojo::Path->new;
81 return $self->{path} unless @_;
82
83 # New path
84 $self->{path} = ref $_[0] ? $_[0] : $self->{path}->merge($_[0]);
85
86 return $self;
87}
88
89sub path_query {
90 my ($self, $pq) = @_;
91
92 if (defined $pq) {
93 return $self unless $pq =~ /^([^?#]*)(?:\?([^#]*))?/;
94 return defined $2 ? $self->path($1)->query($2) : $self->path($1);
95 }
96
97 my $query = $self->query->to_string;
98 return $self->path->to_string . (length $query ? "?$query" : '');
99}
100
101sub protocol { lc(shift->scheme // '') }
102
103sub query {
104 my $self = shift;
105
106 # Old parameters
107 my $q = $self->{query} ||= Mojo::Parameters->new;
108 return $q unless @_;
109
110 # Replace with list
111 if (@_ > 1) { $q->pairs([])->parse(@_) }
112
113 # Merge with array
114 elsif (ref $_[0] eq 'ARRAY') { $q->merge(@{$_[0]}) }
115
116 # Append hash
117 elsif (ref $_[0] eq 'HASH') { $q->append(%{$_[0]}) }
118
119 # New parameters
120 else { $self->{query} = ref $_[0] ? $_[0] : $q->parse($_[0]) }
121
122 return $self;
123}
124
125sub to_abs {
126 my $self = shift;
127
128 my $abs = $self->clone;
129 return $abs if $abs->is_abs;
130
131 # Scheme
132 my $base = shift || $abs->base;
133 $abs->base($base)->scheme($base->scheme);
134
135 # Authority
136 return $abs if $abs->host;
137 $abs->userinfo($base->userinfo)->host($base->host)->port($base->port);
138
139 # Absolute path
140 my $path = $abs->path;
141 return $abs if $path->leading_slash;
142
143 # Inherit path
144 if (!@{$path->parts}) {
145 $abs->path($base->path->clone->canonicalize);
146
147 # Query
148 $abs->query($base->query->clone) unless length $abs->query->to_string;
149 }
150
151 # Merge paths
152 else { $abs->path($base->path->clone->merge($path)->canonicalize) }
153
154 return $abs;
155}
156
157sub to_string { shift->_string(0) }
158sub to_unsafe_string { shift->_string(1) }
159
160sub username { (shift->userinfo // '') =~ /^([^:]+)/ ? $1 : undef }
161
162sub _decode { decode('UTF-8', $_[0]) // $_[0] }
163
164sub _encode { url_escape encode('UTF-8', $_[0]), $_[1] }
165
166sub _string {
167 my ($self, $unsafe) = @_;
168
169 # Scheme
170 my $url = '';
171 if (my $proto = $self->protocol) { $url .= "$proto:" }
172
173 # Authority
174 my $auth = $self->host_port;
175 $auth = _encode($auth, '^A-Za-z0-9\-._~!$&\'()*+,;=:\[\]') if defined $auth;
176 if ($unsafe && defined(my $info = $self->userinfo)) {
177 $auth = _encode($info, '^A-Za-z0-9\-._~!$&\'()*+,;=:') . '@' . $auth;
178 }
179 $url .= "//$auth" if defined $auth;
180
181 # Path and query
182 my $path = $self->path_query;
183 $url .= !$auth || !length $path || $path =~ m!^[/?]! ? $path : "/$path";
184
185 # Fragment
186 return $url unless defined(my $fragment = $self->fragment);
187 return $url . '#' . _encode($fragment, '^A-Za-z0-9\-._~!$&\'()*+,;=:@/?');
188}
189
1901;
191
192=encoding utf8
193
194=head1 NAME
195
196Mojo::URL - Uniform Resource Locator
197
198=head1 SYNOPSIS
199
200 use Mojo::URL;
201
202 # Parse
203 my $url = Mojo::URL->new('http://sri:[email protected]:3000/foo?foo=bar#23');
204 say $url->scheme;
205 say $url->userinfo;
206 say $url->host;
207 say $url->port;
208 say $url->path;
209 say $url->query;
210 say $url->fragment;
211
212 # Build
213 my $url = Mojo::URL->new;
214 $url->scheme('http');
215 $url->host('example.com');
216 $url->port(3000);
217 $url->path('/foo/bar');
218 $url->query(foo => 'bar');
219 $url->fragment(23);
220 say "$url";
221
222=head1 DESCRIPTION
223
224L<Mojo::URL> implements a subset of
225L<RFC 3986|http://tools.ietf.org/html/rfc3986>,
226L<RFC 3987|http://tools.ietf.org/html/rfc3987> and the
227L<URL Living Standard|https://url.spec.whatwg.org> for Uniform Resource
228Locators with support for IDNA and IRIs.
229
230=head1 ATTRIBUTES
231
232L<Mojo::URL> implements the following attributes.
233
234=head2 base
235
236 my $base = $url->base;
237 $url = $url->base(Mojo::URL->new);
238
239Base of this URL, defaults to a L<Mojo::URL> object.
240
241 "http://example.com/a/b?c"
242 Mojo::URL->new("/a/b?c")->base(Mojo::URL->new("http://example.com"))->to_abs;
243
244=head2 fragment
245
246 my $fragment = $url->fragment;
247 $url = $url->fragment('♥mojolicious♥');
248
249Fragment part of this URL.
250
251 # "yada"
252 Mojo::URL->new('http://example.com/foo?bar=baz#yada')->fragment;
253
254=head2 host
255
256 my $host = $url->host;
257 $url = $url->host('127.0.0.1');
258
259Host part of this URL.
260
261 # "example.com"
262 Mojo::URL->new('http://sri:[email protected]:8080/foo')->host;
263
264=head2 port
265
266 my $port = $url->port;
267 $url = $url->port(8080);
268
269Port part of this URL.
270
271 # "8080"
272 Mojo::URL->new('http://sri:[email protected]:8080/foo')->port;
273
274=head2 scheme
275
276 my $scheme = $url->scheme;
277 $url = $url->scheme('http');
278
279Scheme part of this URL.
280
281 # "http"
282 Mojo::URL->new('http://example.com/foo')->scheme;
283
284=head2 userinfo
285
286 my $info = $url->userinfo;
287 $url = $url->userinfo('root:♥');
288
289Userinfo part of this URL.
290
291 # "sri:t3st"
292 Mojo::URL->new('https://sri:[email protected]/foo')->userinfo;
293
294=head1 METHODS
295
296L<Mojo::URL> inherits all methods from L<Mojo::Base> and implements the
297following new ones.
298
299=head2 clone
300
301 my $url2 = $url->clone;
302
303Return a new L<Mojo::URL> object cloned from this URL.
304
305=head2 host_port
306
307 my $host_port = $url->host_port;
308 $url = $url->host_port('example.com:8080');
309
310Normalized version of L</"host"> and L</"port">.
311
312 # "xn--n3h.net:8080"
313 Mojo::URL->new('http://☃.net:8080/test')->host_port;
314
315 # "example.com"
316 Mojo::URL->new('http://example.com/test')->host_port;
317
318=head2 ihost
319
320 my $ihost = $url->ihost;
321 $url = $url->ihost('xn--bcher-kva.ch');
322
323Host part of this URL in punycode format.
324
325 # "xn--n3h.net"
326 Mojo::URL->new('http://☃.net')->ihost;
327
328 # "example.com"
329 Mojo::URL->new('http://example.com')->ihost;
330
331=head2 is_abs
332
333 my $bool = $url->is_abs;
334
335Check if URL is absolute.
336
337 # True
338 Mojo::URL->new('http://example.com')->is_abs;
339 Mojo::URL->new('http://example.com/test/index.html')->is_abs;
340
341 # False
342 Mojo::URL->new('test/index.html')->is_abs;
343 Mojo::URL->new('/test/index.html')->is_abs;
344 Mojo::URL->new('//example.com/test/index.html')->is_abs;
345
346=head2 new
347
348 my $url = Mojo::URL->new;
349 my $url = Mojo::URL->new('http://127.0.0.1:3000/foo?f=b&baz=2#foo');
350
351Construct a new L<Mojo::URL> object and L</"parse"> URL if necessary.
352
353=head2 parse
354
355 $url = $url->parse('http://127.0.0.1:3000/foo/bar?fo=o&baz=23#foo');
356
357Parse relative or absolute URL.
358
359 # "/test/123"
360 $url->parse('/test/123?foo=bar')->path;
361
362 # "example.com"
363 $url->parse('http://example.com/test/123?foo=bar')->host;
364
365 # "[email protected]"
366 $url->parse('mailto:[email protected]')->path;
367
368=head2 password
369
370 my $password = $url->password;
371
372Password part of L</"userinfo">.
373
374 # "s3cret"
375 Mojo::URL->new('http://isabel:[email protected]')->password;
376
377 # "s:3:c:r:e:t"
378 Mojo::URL->new('http://isabel:s:3:c:r:e:[email protected]')->password;
379
380=head2 path
381
382 my $path = $url->path;
383 $url = $url->path('foo/bar');
384 $url = $url->path('/foo/bar');
385 $url = $url->path(Mojo::Path->new);
386
387Path part of this URL, relative paths will be merged with
388L<Mojo::Path/"merge">, defaults to a L<Mojo::Path> object.
389
390 # "perldoc"
391 Mojo::URL->new('http://example.com/perldoc/Mojo')->path->parts->[0];
392
393 # "/perldoc/DOM/HTML"
394 Mojo::URL->new('http://example.com/perldoc/Mojo')->path->merge('DOM/HTML');
395
396 # "http://example.com/DOM/HTML"
397 Mojo::URL->new('http://example.com/perldoc/Mojo')->path('/DOM/HTML');
398
399 # "http://example.com/perldoc/DOM/HTML"
400 Mojo::URL->new('http://example.com/perldoc/Mojo')->path('DOM/HTML');
401
402 # "http://example.com/perldoc/Mojo/DOM/HTML"
403 Mojo::URL->new('http://example.com/perldoc/Mojo/')->path('DOM/HTML');
404
405=head2 path_query
406
407 my $path_query = $url->path_query;
408 $url = $url->path_query('/foo/bar?a=1&b=2');
409
410Normalized version of L</"path"> and L</"query">.
411
412 # "/test?a=1&b=2"
413 Mojo::URL->new('http://example.com/test?a=1&b=2')->path_query;
414
415 # "/"
416 Mojo::URL->new('http://example.com/')->path_query;
417
418=head2 protocol
419
420 my $proto = $url->protocol;
421
422Normalized version of L</"scheme">.
423
424 # "http"
425 Mojo::URL->new('HtTp://example.com')->protocol;
426
427=head2 query
428
429 my $query = $url->query;
430 $url = $url->query([merge => 'with']);
431 $url = $url->query({append => 'to'});
432 $url = $url->query(replace => 'with');
433 $url = $url->query('a=1&b=2');
434 $url = $url->query(Mojo::Parameters->new);
435
436Query part of this URL, key/value pairs in an array reference will be merged
437with L<Mojo::Parameters/"merge">, and key/value pairs in a hash reference
438appended with L<Mojo::Parameters/"append">, defaults to a L<Mojo::Parameters>
439object.
440
441 # "2"
442 Mojo::URL->new('http://example.com?a=1&b=2')->query->param('b');
443
444 # "a=2&b=2&c=3"
445 Mojo::URL->new('http://example.com?a=1&b=2')->query->merge(a => 2, c => 3);
446
447 # "http://example.com?a=2&c=3"
448 Mojo::URL->new('http://example.com?a=1&b=2')->query(a => 2, c => 3);
449
450 # "http://example.com?a=2&a=3"
451 Mojo::URL->new('http://example.com?a=1&b=2')->query(a => [2, 3]);
452
453 # "http://example.com?a=2&b=2&c=3"
454 Mojo::URL->new('http://example.com?a=1&b=2')->query([a => 2, c => 3]);
455
456 # "http://example.com?b=2"
457 Mojo::URL->new('http://example.com?a=1&b=2')->query([a => undef]);
458
459 # "http://example.com?a=1&b=2&a=2&c=3"
460 Mojo::URL->new('http://example.com?a=1&b=2')->query({a => 2, c => 3});
461
462=head2 to_abs
463
464 my $abs = $url->to_abs;
465 my $abs = $url->to_abs(Mojo::URL->new('http://example.com/foo'));
466
467Return a new L<Mojo::URL> object cloned from this relative URL and turn it into
468an absolute one using L</"base"> or provided base URL.
469
470 # "http://example.com/foo/baz.xml?test=123"
471 Mojo::URL->new('baz.xml?test=123')
472 ->to_abs(Mojo::URL->new('http://example.com/foo/bar.html'));
473
474 # "http://example.com/baz.xml?test=123"
475 Mojo::URL->new('/baz.xml?test=123')
476 ->to_abs(Mojo::URL->new('http://example.com/foo/bar.html'));
477
478 # "http://example.com/foo/baz.xml?test=123"
479 Mojo::URL->new('//example.com/foo/baz.xml?test=123')
480 ->to_abs(Mojo::URL->new('http://example.com/foo/bar.html'));
481
482=head2 to_string
483
484 my $str = $url->to_string;
485
486Turn URL into a string. Note that L</"userinfo"> will not be included for
487security reasons.
488
489 # "http://mojolicious.org"
490 Mojo::URL->new->scheme('http')->host('mojolicious.org')->to_string;
491
492 # "http://mojolicious.org"
493 Mojo::URL->new('http://daniel:[email protected]')->to_string;
494
495=head2 to_unsafe_string
496
497 my $str = $url->to_unsafe_string;
498
499Same as L</"to_string">, but includes L</"userinfo">.
500
501 # "http://daniel:[email protected]"
502 Mojo::URL->new('http://daniel:[email protected]')->to_unsafe_string;
503
504=head2 username
505
506 my $username = $url->username;
507
508Username part of L</"userinfo">.
509
510 # "isabel"
511 Mojo::URL->new('http://isabel:[email protected]')->username;
512
513=head1 OPERATORS
514
515L<Mojo::URL> overloads the following operators.
516
517=head2 bool
518
519 my $bool = !!$url;
520
521Always true.
522
523=head2 stringify
524
525 my $str = "$url";
526
527Alias for L</"to_string">.
528
529=head1 SEE ALSO
530
531L<Mojolicious>, L<Mojolicious::Guides>, L<https://mojolicious.org>.
532
533=cut
Note: See TracBrowser for help on using the repository browser.