source: for-distributions/trunk/bin/windows/perl/lib/Text/Balanced.pm@ 14489

Last change on this file since 14489 was 14489, checked in by oranfry, 17 years ago

upgrading to perl 5.8

File size: 65.8 KB
Line 
1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4use 5.005;
5use strict;
6
7package Text::Balanced;
8
9use Exporter;
10use SelfLoader;
11use vars qw { $VERSION @ISA %EXPORT_TAGS };
12
13$VERSION = '1.95';
14@ISA = qw ( Exporter );
15
16%EXPORT_TAGS = ( ALL => [ qw(
17 &extract_delimited
18 &extract_bracketed
19 &extract_quotelike
20 &extract_codeblock
21 &extract_variable
22 &extract_tagged
23 &extract_multiple
24
25 &gen_delimited_pat
26 &gen_extract_tagged
27
28 &delimited_pat
29 ) ] );
30
31Exporter::export_ok_tags('ALL');
32
33# PROTOTYPES
34
35sub _match_bracketed($$$$$$);
36sub _match_variable($$);
37sub _match_codeblock($$$$$$$);
38sub _match_quotelike($$$$);
39
40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
42sub _failmsg {
43 my ($message, $pos) = @_;
44 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
45}
46
47sub _fail
48{
49 my ($wantarray, $textref, $message, $pos) = @_;
50 _failmsg $message, $pos if $message;
51 return ("",$$textref,"") if $wantarray;
52 return undef;
53}
54
55sub _succeed
56{
57 $@ = undef;
58 my ($wantarray,$textref) = splice @_, 0, 2;
59 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
60 my ($startlen) = $_[5];
61 my $remainderpos = $_[2];
62 if ($wantarray)
63 {
64 my @res;
65 while (my ($from, $len) = splice @_, 0, 2)
66 {
67 push @res, substr($$textref,$from,$len);
68 }
69 if ($extralen) { # CORRECT FILLET
70 my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
71 $res[1] = "$extra$res[1]";
72 eval { substr($$textref,$remainderpos,0) = $extra;
73 substr($$textref,$extrapos,$extralen,"\n")} ;
74 #REARRANGE HERE DOC AND FILLET IF POSSIBLE
75 pos($$textref) = $remainderpos-$extralen+1; # RESET \G
76 }
77 else {
78 pos($$textref) = $remainderpos; # RESET \G
79 }
80 return @res;
81 }
82 else
83 {
84 my $match = substr($$textref,$_[0],$_[1]);
85 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
86 my $extra = $extralen
87 ? substr($$textref, $extrapos, $extralen)."\n" : "";
88 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
89 pos($$textref) = $_[4]; # RESET \G
90 return $match;
91 }
92}
93
94# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95
96sub gen_delimited_pat($;$) # ($delimiters;$escapes)
97{
98 my ($dels, $escs) = @_;
99 return "" unless $dels =~ /\S/;
100 $escs = '\\' unless $escs;
101 $escs .= substr($escs,-1) x (length($dels)-length($escs));
102 my @pat = ();
103 my $i;
104 for ($i=0; $i<length $dels; $i++)
105 {
106 my $del = quotemeta substr($dels,$i,1);
107 my $esc = quotemeta substr($escs,$i,1);
108 if ($del eq $esc)
109 {
110 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
111 }
112 else
113 {
114 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
115 }
116 }
117 my $pat = join '|', @pat;
118 return "(?:$pat)";
119}
120
121*delimited_pat = \&gen_delimited_pat;
122
123
124# THE EXTRACTION FUNCTIONS
125
126sub extract_delimited (;$$$$)
127{
128 my $textref = defined $_[0] ? \$_[0] : \$_;
129 my $wantarray = wantarray;
130 my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
131 my $pre = defined $_[2] ? $_[2] : '\s*';
132 my $esc = defined $_[3] ? $_[3] : qq{\\};
133 my $pat = gen_delimited_pat($del, $esc);
134 my $startpos = pos $$textref || 0;
135 return _fail($wantarray, $textref, "Not a delimited pattern", 0)
136 unless $$textref =~ m/\G($pre)($pat)/gc;
137 my $prelen = length($1);
138 my $matchpos = $startpos+$prelen;
139 my $endpos = pos $$textref;
140 return _succeed $wantarray, $textref,
141 $matchpos, $endpos-$matchpos, # MATCH
142 $endpos, length($$textref)-$endpos, # REMAINDER
143 $startpos, $prelen; # PREFIX
144}
145
146sub extract_bracketed (;$$$)
147{
148 my $textref = defined $_[0] ? \$_[0] : \$_;
149 my $ldel = defined $_[1] ? $_[1] : '{([<';
150 my $pre = defined $_[2] ? $_[2] : '\s*';
151 my $wantarray = wantarray;
152 my $qdel = "";
153 my $quotelike;
154 $ldel =~ s/'//g and $qdel .= q{'};
155 $ldel =~ s/"//g and $qdel .= q{"};
156 $ldel =~ s/`//g and $qdel .= q{`};
157 $ldel =~ s/q//g and $quotelike = 1;
158 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
159 my $rdel = $ldel;
160 unless ($rdel =~ tr/[({</])}>/)
161 {
162 return _fail $wantarray, $textref,
163 "Did not find a suitable bracket in delimiter: \"$_[1]\"",
164 0;
165 }
166 my $posbug = pos;
167 $ldel = join('|', map { quotemeta $_ } split('', $ldel));
168 $rdel = join('|', map { quotemeta $_ } split('', $rdel));
169 pos = $posbug;
170
171 my $startpos = pos $$textref || 0;
172 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
173
174 return _fail ($wantarray, $textref) unless @match;
175
176 return _succeed ( $wantarray, $textref,
177 $match[2], $match[5]+2, # MATCH
178 @match[8,9], # REMAINDER
179 @match[0,1], # PREFIX
180 );
181}
182
183sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
184{
185 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
186 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
187 unless ($$textref =~ m/\G$pre/gc)
188 {
189 _failmsg "Did not find prefix: /$pre/", $startpos;
190 return;
191 }
192
193 $ldelpos = pos $$textref;
194
195 unless ($$textref =~ m/\G($ldel)/gc)
196 {
197 _failmsg "Did not find opening bracket after prefix: \"$pre\"",
198 pos $$textref;
199 pos $$textref = $startpos;
200 return;
201 }
202
203 my @nesting = ( $1 );
204 my $textlen = length $$textref;
205 while (pos $$textref < $textlen)
206 {
207 next if $$textref =~ m/\G\\./gcs;
208
209 if ($$textref =~ m/\G($ldel)/gc)
210 {
211 push @nesting, $1;
212 }
213 elsif ($$textref =~ m/\G($rdel)/gc)
214 {
215 my ($found, $brackettype) = ($1, $1);
216 if ($#nesting < 0)
217 {
218 _failmsg "Unmatched closing bracket: \"$found\"",
219 pos $$textref;
220 pos $$textref = $startpos;
221 return;
222 }
223 my $expected = pop(@nesting);
224 $expected =~ tr/({[</)}]>/;
225 if ($expected ne $brackettype)
226 {
227 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
228 pos $$textref;
229 pos $$textref = $startpos;
230 return;
231 }
232 last if $#nesting < 0;
233 }
234 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
235 {
236 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
237 _failmsg "Unmatched embedded quote ($1)",
238 pos $$textref;
239 pos $$textref = $startpos;
240 return;
241 }
242 elsif ($quotelike && _match_quotelike($textref,"",1,0))
243 {
244 next;
245 }
246
247 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
248 }
249 if ($#nesting>=0)
250 {
251 _failmsg "Unmatched opening bracket(s): "
252 . join("..",@nesting)."..",
253 pos $$textref;
254 pos $$textref = $startpos;
255 return;
256 }
257
258 $endpos = pos $$textref;
259
260 return (
261 $startpos, $ldelpos-$startpos, # PREFIX
262 $ldelpos, 1, # OPENING BRACKET
263 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
264 $endpos-1, 1, # CLOSING BRACKET
265 $endpos, length($$textref)-$endpos, # REMAINDER
266 );
267}
268
269sub revbracket($)
270{
271 my $brack = reverse $_[0];
272 $brack =~ tr/[({</])}>/;
273 return $brack;
274}
275
276my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277
278sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
279{
280 my $textref = defined $_[0] ? \$_[0] : \$_;
281 my $ldel = $_[1];
282 my $rdel = $_[2];
283 my $pre = defined $_[3] ? $_[3] : '\s*';
284 my %options = defined $_[4] ? %{$_[4]} : ();
285 my $omode = defined $options{fail} ? $options{fail} : '';
286 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
287 : defined($options{reject}) ? $options{reject}
288 : ''
289 ;
290 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
291 : defined($options{ignore}) ? $options{ignore}
292 : ''
293 ;
294
295 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
296 $@ = undef;
297
298 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
299
300 return _fail(wantarray, $textref) unless @match;
301 return _succeed wantarray, $textref,
302 $match[2], $match[3]+$match[5]+$match[7], # MATCH
303 @match[8..9,0..1,2..7]; # REM, PRE, BITS
304}
305
306sub _match_tagged # ($$$$$$$)
307{
308 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
309 my $rdelspec;
310
311 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
312
313 unless ($$textref =~ m/\G($pre)/gc)
314 {
315 _failmsg "Did not find prefix: /$pre/", pos $$textref;
316 goto failed;
317 }
318
319 $opentagpos = pos($$textref);
320
321 unless ($$textref =~ m/\G$ldel/gc)
322 {
323 _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
324 goto failed;
325 }
326
327 $textpos = pos($$textref);
328
329 if (!defined $rdel)
330 {
331 $rdelspec = $&;
332 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
333 {
334 _failmsg "Unable to construct closing tag to match: $rdel",
335 pos $$textref;
336 goto failed;
337 }
338 }
339 else
340 {
341 $rdelspec = eval "qq{$rdel}" || do {
342 my $del;
343 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
344 { next if $rdel =~ /\Q$_/; $del = $_; last }
345 unless ($del) {
346 use Carp;
347 croak "Can't interpolate right delimiter $rdel"
348 }
349 eval "qq$del$rdel$del";
350 };
351 }
352
353 while (pos($$textref) < length($$textref))
354 {
355 next if $$textref =~ m/\G\\./gc;
356
357 if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
358 {
359 $parapos = pos($$textref) - length($1)
360 unless defined $parapos;
361 }
362 elsif ($$textref =~ m/\G($rdelspec)/gc )
363 {
364 $closetagpos = pos($$textref)-length($1);
365 goto matched;
366 }
367 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
368 {
369 next;
370 }
371 elsif ($bad && $$textref =~ m/\G($bad)/gcs)
372 {
373 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
374 goto short if ($omode eq 'PARA' || $omode eq 'MAX');
375 _failmsg "Found invalid nested tag: $1", pos $$textref;
376 goto failed;
377 }
378 elsif ($$textref =~ m/\G($ldel)/gc)
379 {
380 my $tag = $1;
381 pos($$textref) -= length($tag); # REWIND TO NESTED TAG
382 unless (_match_tagged(@_)) # MATCH NESTED TAG
383 {
384 goto short if $omode eq 'PARA' || $omode eq 'MAX';
385 _failmsg "Found unbalanced nested tag: $tag",
386 pos $$textref;
387 goto failed;
388 }
389 }
390 else { $$textref =~ m/./gcs }
391 }
392
393short:
394 $closetagpos = pos($$textref);
395 goto matched if $omode eq 'MAX';
396 goto failed unless $omode eq 'PARA';
397
398 if (defined $parapos) { pos($$textref) = $parapos }
399 else { $parapos = pos($$textref) }
400
401 return (
402 $startpos, $opentagpos-$startpos, # PREFIX
403 $opentagpos, $textpos-$opentagpos, # OPENING TAG
404 $textpos, $parapos-$textpos, # TEXT
405 $parapos, 0, # NO CLOSING TAG
406 $parapos, length($$textref)-$parapos, # REMAINDER
407 );
408
409matched:
410 $endpos = pos($$textref);
411 return (
412 $startpos, $opentagpos-$startpos, # PREFIX
413 $opentagpos, $textpos-$opentagpos, # OPENING TAG
414 $textpos, $closetagpos-$textpos, # TEXT
415 $closetagpos, $endpos-$closetagpos, # CLOSING TAG
416 $endpos, length($$textref)-$endpos, # REMAINDER
417 );
418
419failed:
420 _failmsg "Did not find closing tag", pos $$textref unless $@;
421 pos($$textref) = $startpos;
422 return;
423}
424
425sub extract_variable (;$$)
426{
427 my $textref = defined $_[0] ? \$_[0] : \$_;
428 return ("","","") unless defined $$textref;
429 my $pre = defined $_[1] ? $_[1] : '\s*';
430
431 my @match = _match_variable($textref,$pre);
432
433 return _fail wantarray, $textref unless @match;
434
435 return _succeed wantarray, $textref,
436 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
437}
438
439sub _match_variable($$)
440{
441# $#
442# $^
443# $$
444 my ($textref, $pre) = @_;
445 my $startpos = pos($$textref) = pos($$textref)||0;
446 unless ($$textref =~ m/\G($pre)/gc)
447 {
448 _failmsg "Did not find prefix: /$pre/", pos $$textref;
449 return;
450 }
451 my $varpos = pos($$textref);
452 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
453 {
454 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
455 {
456 _failmsg "Did not find leading dereferencer", pos $$textref;
457 pos $$textref = $startpos;
458 return;
459 }
460 my $deref = $1;
461
462 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
463 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
464 or $deref eq '$#' or $deref eq '$$' )
465 {
466 _failmsg "Bad identifier after dereferencer", pos $$textref;
467 pos $$textref = $startpos;
468 return;
469 }
470 }
471
472 while (1)
473 {
474 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
475 next if _match_codeblock($textref,
476 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
477 qr/[({[]/, qr/[)}\]]/,
478 qr/[({[]/, qr/[)}\]]/, 0);
479 next if _match_codeblock($textref,
480 qr/\s*/, qr/[{[]/, qr/[}\]]/,
481 qr/[{[]/, qr/[}\]]/, 0);
482 next if _match_variable($textref,'\s*->\s*');
483 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
484 last;
485 }
486
487 my $endpos = pos($$textref);
488 return ($startpos, $varpos-$startpos,
489 $varpos, $endpos-$varpos,
490 $endpos, length($$textref)-$endpos
491 );
492}
493
494sub extract_codeblock (;$$$$$)
495{
496 my $textref = defined $_[0] ? \$_[0] : \$_;
497 my $wantarray = wantarray;
498 my $ldel_inner = defined $_[1] ? $_[1] : '{';
499 my $pre = defined $_[2] ? $_[2] : '\s*';
500 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
501 my $rd = $_[4];
502 my $rdel_inner = $ldel_inner;
503 my $rdel_outer = $ldel_outer;
504 my $posbug = pos;
505 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
506 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
507 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
508 {
509 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
510 }
511 pos = $posbug;
512
513 my @match = _match_codeblock($textref, $pre,
514 $ldel_outer, $rdel_outer,
515 $ldel_inner, $rdel_inner,
516 $rd);
517 return _fail($wantarray, $textref) unless @match;
518 return _succeed($wantarray, $textref,
519 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
520 );
521
522}
523
524sub _match_codeblock($$$$$$$)
525{
526 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
527 my $startpos = pos($$textref) = pos($$textref) || 0;
528 unless ($$textref =~ m/\G($pre)/gc)
529 {
530 _failmsg qq{Did not match prefix /$pre/ at"} .
531 substr($$textref,pos($$textref),20) .
532 q{..."},
533 pos $$textref;
534 return;
535 }
536 my $codepos = pos($$textref);
537 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
538 {
539 _failmsg qq{Did not find expected opening bracket at "} .
540 substr($$textref,pos($$textref),20) .
541 q{..."},
542 pos $$textref;
543 pos $$textref = $startpos;
544 return;
545 }
546 my $closing = $1;
547 $closing =~ tr/([<{/)]>}/;
548 my $matched;
549 my $patvalid = 1;
550 while (pos($$textref) < length($$textref))
551 {
552 $matched = '';
553 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
554 {
555 $patvalid = 0;
556 next;
557 }
558
559 if ($$textref =~ m/\G\s*#.*/gc)
560 {
561 next;
562 }
563
564 if ($$textref =~ m/\G\s*($rdel_outer)/gc)
565 {
566 unless ($matched = ($closing && $1 eq $closing) )
567 {
568 next if $1 eq '>'; # MIGHT BE A "LESS THAN"
569 _failmsg q{Mismatched closing bracket at "} .
570 substr($$textref,pos($$textref),20) .
571 qq{...". Expected '$closing'},
572 pos $$textref;
573 }
574 last;
575 }
576
577 if (_match_variable($textref,'\s*') ||
578 _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
579 {
580 $patvalid = 0;
581 next;
582 }
583
584
585 # NEED TO COVER MANY MORE CASES HERE!!!
586 if ($$textref =~ m#\G\s*(?!$ldel_inner)
587 ( [-+*x/%^&|.]=?
588 | [!=]~
589 | =(?!>)
590 | (\*\*|&&|\|\||<<|>>)=?
591 | split|grep|map|return
592 | [([]
593 )#gcx)
594 {
595 $patvalid = 1;
596 next;
597 }
598
599 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
600 {
601 $patvalid = 1;
602 next;
603 }
604
605 if ($$textref =~ m/\G\s*$ldel_outer/gc)
606 {
607 _failmsg q{Improperly nested codeblock at "} .
608 substr($$textref,pos($$textref),20) .
609 q{..."},
610 pos $$textref;
611 last;
612 }
613
614 $patvalid = 0;
615 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
616 }
617 continue { $@ = undef }
618
619 unless ($matched)
620 {
621 _failmsg 'No match found for opening bracket', pos $$textref
622 unless $@;
623 return;
624 }
625
626 my $endpos = pos($$textref);
627 return ( $startpos, $codepos-$startpos,
628 $codepos, $endpos-$codepos,
629 $endpos, length($$textref)-$endpos,
630 );
631}
632
633
634my %mods = (
635 'none' => '[cgimsox]*',
636 'm' => '[cgimsox]*',
637 's' => '[cegimsox]*',
638 'tr' => '[cds]*',
639 'y' => '[cds]*',
640 'qq' => '',
641 'qx' => '',
642 'qw' => '',
643 'qr' => '[imsx]*',
644 'q' => '',
645 );
646
647sub extract_quotelike (;$$)
648{
649 my $textref = $_[0] ? \$_[0] : \$_;
650 my $wantarray = wantarray;
651 my $pre = defined $_[1] ? $_[1] : '\s*';
652
653 my @match = _match_quotelike($textref,$pre,1,0);
654 return _fail($wantarray, $textref) unless @match;
655 return _succeed($wantarray, $textref,
656 $match[2], $match[18]-$match[2], # MATCH
657 @match[18,19], # REMAINDER
658 @match[0,1], # PREFIX
659 @match[2..17], # THE BITS
660 @match[20,21], # ANY FILLET?
661 );
662};
663
664sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
665{
666 my ($textref, $pre, $rawmatch, $qmark) = @_;
667
668 my ($textlen,$startpos,
669 $oppos,
670 $preld1pos,$ld1pos,$str1pos,$rd1pos,
671 $preld2pos,$ld2pos,$str2pos,$rd2pos,
672 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
673
674 unless ($$textref =~ m/\G($pre)/gc)
675 {
676 _failmsg qq{Did not find prefix /$pre/ at "} .
677 substr($$textref, pos($$textref), 20) .
678 q{..."},
679 pos $$textref;
680 return;
681 }
682 $oppos = pos($$textref);
683
684 my $initial = substr($$textref,$oppos,1);
685
686 if ($initial && $initial =~ m|^[\"\'\`]|
687 || $rawmatch && $initial =~ m|^/|
688 || $qmark && $initial =~ m|^\?|)
689 {
690 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
691 {
692 _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
693 substr($$textref, $oppos, 20) .
694 q{..."},
695 pos $$textref;
696 pos $$textref = $startpos;
697 return;
698 }
699 $modpos= pos($$textref);
700 $rd1pos = $modpos-1;
701
702 if ($initial eq '/' || $initial eq '?')
703 {
704 $$textref =~ m/\G$mods{none}/gc
705 }
706
707 my $endpos = pos($$textref);
708 return (
709 $startpos, $oppos-$startpos, # PREFIX
710 $oppos, 0, # NO OPERATOR
711 $oppos, 1, # LEFT DEL
712 $oppos+1, $rd1pos-$oppos-1, # STR/PAT
713 $rd1pos, 1, # RIGHT DEL
714 $modpos, 0, # NO 2ND LDEL
715 $modpos, 0, # NO 2ND STR
716 $modpos, 0, # NO 2ND RDEL
717 $modpos, $endpos-$modpos, # MODIFIERS
718 $endpos, $textlen-$endpos, # REMAINDER
719 );
720 }
721
722 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
723 {
724 _failmsg q{No quotelike operator found after prefix at "} .
725 substr($$textref, pos($$textref), 20) .
726 q{..."},
727 pos $$textref;
728 pos $$textref = $startpos;
729 return;
730 }
731
732 my $op = $1;
733 $preld1pos = pos($$textref);
734 if ($op eq '<<') {
735 $ld1pos = pos($$textref);
736 my $label;
737 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
738 $label = $1;
739 }
740 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
741 | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
742 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
743 }gcsx) {
744 $label = $+;
745 }
746 else {
747 $label = "";
748 }
749 my $extrapos = pos($$textref);
750 $$textref =~ m{.*\n}gc;
751 $str1pos = pos($$textref);
752 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
753 _failmsg qq{Missing here doc terminator ('$label') after "} .
754 substr($$textref, $startpos, 20) .
755 q{..."},
756 pos $$textref;
757 pos $$textref = $startpos;
758 return;
759 }
760 $rd1pos = pos($$textref);
761 $$textref =~ m{$label\n}gc;
762 $ld2pos = pos($$textref);
763 return (
764 $startpos, $oppos-$startpos, # PREFIX
765 $oppos, length($op), # OPERATOR
766 $ld1pos, $extrapos-$ld1pos, # LEFT DEL
767 $str1pos, $rd1pos-$str1pos, # STR/PAT
768 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
769 $ld2pos, 0, # NO 2ND LDEL
770 $ld2pos, 0, # NO 2ND STR
771 $ld2pos, 0, # NO 2ND RDEL
772 $ld2pos, 0, # NO MODIFIERS
773 $ld2pos, $textlen-$ld2pos, # REMAINDER
774 $extrapos, $str1pos-$extrapos, # FILLETED BIT
775 );
776 }
777
778 $$textref =~ m/\G\s*/gc;
779 $ld1pos = pos($$textref);
780 $str1pos = $ld1pos+1;
781
782 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
783 {
784 _failmsg "No block delimiter found after quotelike $op",
785 pos $$textref;
786 pos $$textref = $startpos;
787 return;
788 }
789 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
790 my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
791 if ($ldel1 =~ /[[(<{]/)
792 {
793 $rdel1 =~ tr/[({</])}>/;
794 _match_bracketed($textref,"",$ldel1,"","",$rdel1)
795 || do { pos $$textref = $startpos; return };
796 }
797 else
798 {
799 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
800 || do { pos $$textref = $startpos; return };
801 }
802 $ld2pos = $rd1pos = pos($$textref)-1;
803
804 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
805 if ($second_arg)
806 {
807 my ($ldel2, $rdel2);
808 if ($ldel1 =~ /[[(<{]/)
809 {
810 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
811 {
812 _failmsg "Missing second block for quotelike $op",
813 pos $$textref;
814 pos $$textref = $startpos;
815 return;
816 }
817 $ldel2 = $rdel2 = "\Q$1";
818 $rdel2 =~ tr/[({</])}>/;
819 }
820 else
821 {
822 $ldel2 = $rdel2 = $ldel1;
823 }
824 $str2pos = $ld2pos+1;
825
826 if ($ldel2 =~ /[[(<{]/)
827 {
828 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
829 _match_bracketed($textref,"",$ldel2,"","",$rdel2)
830 || do { pos $$textref = $startpos; return };
831 }
832 else
833 {
834 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
835 || do { pos $$textref = $startpos; return };
836 }
837 $rd2pos = pos($$textref)-1;
838 }
839 else
840 {
841 $ld2pos = $str2pos = $rd2pos = $rd1pos;
842 }
843
844 $modpos = pos $$textref;
845
846 $$textref =~ m/\G($mods{$op})/gc;
847 my $endpos = pos $$textref;
848
849 return (
850 $startpos, $oppos-$startpos, # PREFIX
851 $oppos, length($op), # OPERATOR
852 $ld1pos, 1, # LEFT DEL
853 $str1pos, $rd1pos-$str1pos, # STR/PAT
854 $rd1pos, 1, # RIGHT DEL
855 $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
856 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
857 $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
858 $modpos, $endpos-$modpos, # MODIFIERS
859 $endpos, $textlen-$endpos, # REMAINDER
860 );
861}
862
863my $def_func =
864[
865 sub { extract_variable($_[0], '') },
866 sub { extract_quotelike($_[0],'') },
867 sub { extract_codeblock($_[0],'{}','') },
868];
869
870sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
871{
872 my $textref = defined($_[0]) ? \$_[0] : \$_;
873 my $posbug = pos;
874 my ($lastpos, $firstpos);
875 my @fields = ();
876
877 #for ($$textref)
878 {
879 my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
880 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
881 my $igunk = $_[3];
882
883 pos $$textref ||= 0;
884
885 unless (wantarray)
886 {
887 use Carp;
888 carp "extract_multiple reset maximal count to 1 in scalar context"
889 if $^W && defined($_[2]) && $max > 1;
890 $max = 1
891 }
892
893 my $unkpos;
894 my $func;
895 my $class;
896
897 my @class;
898 foreach $func ( @func )
899 {
900 if (ref($func) eq 'HASH')
901 {
902 push @class, (keys %$func)[0];
903 $func = (values %$func)[0];
904 }
905 else
906 {
907 push @class, undef;
908 }
909 }
910
911 FIELD: while (pos($$textref) < length($$textref))
912 {
913 my ($field, $rem);
914 my @bits;
915 foreach my $i ( 0..$#func )
916 {
917 my $pref;
918 $func = $func[$i];
919 $class = $class[$i];
920 $lastpos = pos $$textref;
921 if (ref($func) eq 'CODE')
922 { ($field,$rem,$pref) = @bits = $func->($$textref);
923 # print "[$field|$rem]" if $field;
924 }
925 elsif (ref($func) eq 'Text::Balanced::Extractor')
926 { @bits = $field = $func->extract($$textref) }
927 elsif( $$textref =~ m/\G$func/gc )
928 { @bits = $field = defined($1) ? $1 : $& }
929 $pref ||= "";
930 if (defined($field) && length($field))
931 {
932 if (!$igunk) {
933 $unkpos = pos $$textref
934 if length($pref) && !defined($unkpos);
935 if (defined $unkpos)
936 {
937 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
938 $firstpos = $unkpos unless defined $firstpos;
939 undef $unkpos;
940 last FIELD if @fields == $max;
941 }
942 }
943 push @fields, $class
944 ? bless (\$field, $class)
945 : $field;
946 $firstpos = $lastpos unless defined $firstpos;
947 $lastpos = pos $$textref;
948 last FIELD if @fields == $max;
949 next FIELD;
950 }
951 }
952 if ($$textref =~ /\G(.)/gcs)
953 {
954 $unkpos = pos($$textref)-1
955 unless $igunk || defined $unkpos;
956 }
957 }
958
959 if (defined $unkpos)
960 {
961 push @fields, substr($$textref, $unkpos);
962 $firstpos = $unkpos unless defined $firstpos;
963 $lastpos = length $$textref;
964 }
965 last;
966 }
967
968 pos $$textref = $lastpos;
969 return @fields if wantarray;
970
971 $firstpos ||= 0;
972 eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
973 pos $$textref = $firstpos };
974 return $fields[0];
975}
976
977
978sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
979{
980 my $ldel = $_[0];
981 my $rdel = $_[1];
982 my $pre = defined $_[2] ? $_[2] : '\s*';
983 my %options = defined $_[3] ? %{$_[3]} : ();
984 my $omode = defined $options{fail} ? $options{fail} : '';
985 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
986 : defined($options{reject}) ? $options{reject}
987 : ''
988 ;
989 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
990 : defined($options{ignore}) ? $options{ignore}
991 : ''
992 ;
993
994 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
995
996 my $posbug = pos;
997 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
998 pos = $posbug;
999
1000 my $closure = sub
1001 {
1002 my $textref = defined $_[0] ? \$_[0] : \$_;
1003 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1004
1005 return _fail(wantarray, $textref) unless @match;
1006 return _succeed wantarray, $textref,
1007 $match[2], $match[3]+$match[5]+$match[7], # MATCH
1008 @match[8..9,0..1,2..7]; # REM, PRE, BITS
1009 };
1010
1011 bless $closure, 'Text::Balanced::Extractor';
1012}
1013
1014package Text::Balanced::Extractor;
1015
1016sub extract($$) # ($self, $text)
1017{
1018 &{$_[0]}($_[1]);
1019}
1020
1021package Text::Balanced::ErrorMsg;
1022
1023use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1024
10251;
1026
1027__END__
1028
1029=head1 NAME
1030
1031Text::Balanced - Extract delimited text sequences from strings.
1032
1033
1034=head1 SYNOPSIS
1035
1036 use Text::Balanced qw (
1037 extract_delimited
1038 extract_bracketed
1039 extract_quotelike
1040 extract_codeblock
1041 extract_variable
1042 extract_tagged
1043 extract_multiple
1044
1045 gen_delimited_pat
1046 gen_extract_tagged
1047 );
1048
1049 # Extract the initial substring of $text that is delimited by
1050 # two (unescaped) instances of the first character in $delim.
1051
1052 ($extracted, $remainder) = extract_delimited($text,$delim);
1053
1054
1055 # Extract the initial substring of $text that is bracketed
1056 # with a delimiter(s) specified by $delim (where the string
1057 # in $delim contains one or more of '(){}[]<>').
1058
1059 ($extracted, $remainder) = extract_bracketed($text,$delim);
1060
1061
1062 # Extract the initial substring of $text that is bounded by
1063 # an XML tag.
1064
1065 ($extracted, $remainder) = extract_tagged($text);
1066
1067
1068 # Extract the initial substring of $text that is bounded by
1069 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1070
1071 ($extracted, $remainder) =
1072 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1073
1074
1075 # Extract the initial substring of $text that represents a
1076 # Perl "quote or quote-like operation"
1077
1078 ($extracted, $remainder) = extract_quotelike($text);
1079
1080
1081 # Extract the initial substring of $text that represents a block
1082 # of Perl code, bracketed by any of character(s) specified by $delim
1083 # (where the string $delim contains one or more of '(){}[]<>').
1084
1085 ($extracted, $remainder) = extract_codeblock($text,$delim);
1086
1087
1088 # Extract the initial substrings of $text that would be extracted by
1089 # one or more sequential applications of the specified functions
1090 # or regular expressions
1091
1092 @extracted = extract_multiple($text,
1093 [ \&extract_bracketed,
1094 \&extract_quotelike,
1095 \&some_other_extractor_sub,
1096 qr/[xyz]*/,
1097 'literal',
1098 ]);
1099
1100# Create a string representing an optimized pattern (a la Friedl)
1101# that matches a substring delimited by any of the specified characters
1102# (in this case: any type of quote or a slash)
1103
1104 $patstring = gen_delimited_pat(q{'"`/});
1105
1106
1107# Generate a reference to an anonymous sub that is just like extract_tagged
1108# but pre-compiled and optimized for a specific pair of tags, and consequently
1109# much faster (i.e. 3 times faster). It uses qr// for better performance on
1110# repeated calls, so it only works under Perl 5.005 or later.
1111
1112 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1113
1114 ($extracted, $remainder) = $extract_head->($text);
1115
1116
1117=head1 DESCRIPTION
1118
1119The various C<extract_...> subroutines may be used to
1120extract a delimited substring, possibly after skipping a
1121specified prefix string. By default, that prefix is
1122optional whitespace (C</\s*/>), but you can change it to whatever
1123you wish (see below).
1124
1125The substring to be extracted must appear at the
1126current C<pos> location of the string's variable
1127(or at index zero, if no C<pos> position is defined).
1128In other words, the C<extract_...> subroutines I<don't>
1129extract the first occurance of a substring anywhere
1130in a string (like an unanchored regex would). Rather,
1131they extract an occurance of the substring appearing
1132immediately at the current matching position in the
1133string (like a C<\G>-anchored regex would).
1134
1135
1136
1137=head2 General behaviour in list contexts
1138
1139In a list context, all the subroutines return a list, the first three
1140elements of which are always:
1141
1142=over 4
1143
1144=item [0]
1145
1146The extracted string, including the specified delimiters.
1147If the extraction fails an empty string is returned.
1148
1149=item [1]
1150
1151The remainder of the input string (i.e. the characters after the
1152extracted string). On failure, the entire string is returned.
1153
1154=item [2]
1155
1156The skipped prefix (i.e. the characters before the extracted string).
1157On failure, the empty string is returned.
1158
1159=back
1160
1161Note that in a list context, the contents of the original input text (the first
1162argument) are not modified in any way.
1163
1164However, if the input text was passed in a variable, that variable's
1165C<pos> value is updated to point at the first character after the
1166extracted text. That means that in a list context the various
1167subroutines can be used much like regular expressions. For example:
1168
1169 while ( $next = (extract_quotelike($text))[0] )
1170 {
1171 # process next quote-like (in $next)
1172 }
1173
1174
1175=head2 General behaviour in scalar and void contexts
1176
1177In a scalar context, the extracted string is returned, having first been
1178removed from the input text. Thus, the following code also processes
1179each quote-like operation, but actually removes them from $text:
1180
1181 while ( $next = extract_quotelike($text) )
1182 {
1183 # process next quote-like (in $next)
1184 }
1185
1186Note that if the input text is a read-only string (i.e. a literal),
1187no attempt is made to remove the extracted text.
1188
1189In a void context the behaviour of the extraction subroutines is
1190exactly the same as in a scalar context, except (of course) that the
1191extracted substring is not returned.
1192
1193=head2 A note about prefixes
1194
1195Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1196This can bite you if you're expecting a prefix specification like
1197'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1198pattern will only succeed if the <H1> tag is on the current line, since
1199. normally doesn't match newlines.
1200
1201To overcome this limitation, you need to turn on /s matching within
1202the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1203
1204
1205=head2 C<extract_delimited>
1206
1207The C<extract_delimited> function formalizes the common idiom
1208of extracting a single-character-delimited substring from the start of
1209a string. For example, to extract a single-quote delimited string, the
1210following code is typically used:
1211
1212 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1213 $extracted = $1;
1214
1215but with C<extract_delimited> it can be simplified to:
1216
1217 ($extracted,$remainder) = extract_delimited($text, "'");
1218
1219C<extract_delimited> takes up to four scalars (the input text, the
1220delimiters, a prefix pattern to be skipped, and any escape characters)
1221and extracts the initial substring of the text that
1222is appropriately delimited. If the delimiter string has multiple
1223characters, the first one encountered in the text is taken to delimit
1224the substring.
1225The third argument specifies a prefix pattern that is to be skipped
1226(but must be present!) before the substring is extracted.
1227The final argument specifies the escape character to be used for each
1228delimiter.
1229
1230All arguments are optional. If the escape characters are not specified,
1231every delimiter is escaped with a backslash (C<\>).
1232If the prefix is not specified, the
1233pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1234is also not specified, the set C</["'`]/> is used. If the text to be processed
1235is not specified either, C<$_> is used.
1236
1237In list context, C<extract_delimited> returns a array of three
1238elements, the extracted substring (I<including the surrounding
1239delimiters>), the remainder of the text, and the skipped prefix (if
1240any). If a suitable delimited substring is not found, the first
1241element of the array is the empty string, the second is the complete
1242original text, and the prefix returned in the third element is an
1243empty string.
1244
1245In a scalar context, just the extracted substring is returned. In
1246a void context, the extracted substring (and any prefix) are simply
1247removed from the beginning of the first argument.
1248
1249Examples:
1250
1251 # Remove a single-quoted substring from the very beginning of $text:
1252
1253 $substring = extract_delimited($text, "'", '');
1254
1255 # Remove a single-quoted Pascalish substring (i.e. one in which
1256 # doubling the quote character escapes it) from the very
1257 # beginning of $text:
1258
1259 $substring = extract_delimited($text, "'", '', "'");
1260
1261 # Extract a single- or double- quoted substring from the
1262 # beginning of $text, optionally after some whitespace
1263 # (note the list context to protect $text from modification):
1264
1265 ($substring) = extract_delimited $text, q{"'};
1266
1267
1268 # Delete the substring delimited by the first '/' in $text:
1269
1270 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1271
1272Note that this last example is I<not> the same as deleting the first
1273quote-like pattern. For instance, if C<$text> contained the string:
1274
1275 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1276
1277then after the deletion it would contain:
1278
1279 "if ('.$UNIXCMD/s) { $cmd = $1; }"
1280
1281not:
1282
1283 "if ('./cmd' =~ ms) { $cmd = $1; }"
1284
1285
1286See L<"extract_quotelike"> for a (partial) solution to this problem.
1287
1288
1289=head2 C<extract_bracketed>
1290
1291Like C<"extract_delimited">, the C<extract_bracketed> function takes
1292up to three optional scalar arguments: a string to extract from, a delimiter
1293specifier, and a prefix pattern. As before, a missing prefix defaults to
1294optional whitespace and a missing text defaults to C<$_>. However, a missing
1295delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1296
1297C<extract_bracketed> extracts a balanced-bracket-delimited
1298substring (using any one (or more) of the user-specified delimiter
1299brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1300respect quoted unbalanced brackets (see below).
1301
1302A "delimiter bracket" is a bracket in list of delimiters passed as
1303C<extract_bracketed>'s second argument. Delimiter brackets are
1304specified by giving either the left or right (or both!) versions
1305of the required bracket(s). Note that the order in which
1306two or more delimiter brackets are specified is not significant.
1307
1308A "balanced-bracket-delimited substring" is a substring bounded by
1309matched brackets, such that any other (left or right) delimiter
1310bracket I<within> the substring is also matched by an opposite
1311(right or left) delimiter bracket I<at the same level of nesting>. Any
1312type of bracket not in the delimiter list is treated as an ordinary
1313character.
1314
1315In other words, each type of bracket specified as a delimiter must be
1316balanced and correctly nested within the substring, and any other kind of
1317("non-delimiter") bracket in the substring is ignored.
1318
1319For example, given the string:
1320
1321 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1322
1323then a call to C<extract_bracketed> in a list context:
1324
1325 @result = extract_bracketed( $text, '{}' );
1326
1327would return:
1328
1329 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1330
1331since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1332(In a scalar context just the first element of the array would be returned. In
1333a void context, C<$text> would be replaced by an empty string.)
1334
1335Likewise the call in:
1336
1337 @result = extract_bracketed( $text, '{[' );
1338
1339would return the same result, since all sets of both types of specified
1340delimiter brackets are correctly nested and balanced.
1341
1342However, the call in:
1343
1344 @result = extract_bracketed( $text, '{([<' );
1345
1346would fail, returning:
1347
1348 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1349
1350because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1351the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1352return an empty string. In a void context, C<$text> would be unchanged.)
1353
1354Note that the embedded single-quotes in the string don't help in this
1355case, since they have not been specified as acceptable delimiters and are
1356therefore treated as non-delimiter characters (and ignored).
1357
1358However, if a particular species of quote character is included in the
1359delimiter specification, then that type of quote will be correctly handled.
1360for example, if C<$text> is:
1361
1362 $text = '<A HREF=">>>>">link</A>';
1363
1364then
1365
1366 @result = extract_bracketed( $text, '<">' );
1367
1368returns:
1369
1370 ( '<A HREF=">>>>">', 'link</A>', "" )
1371
1372as expected. Without the specification of C<"> as an embedded quoter:
1373
1374 @result = extract_bracketed( $text, '<>' );
1375
1376the result would be:
1377
1378 ( '<A HREF=">', '>>>">link</A>', "" )
1379
1380In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1381quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1382letter 'q' as a delimiter. Hence:
1383
1384 @result = extract_bracketed( $text, '<q>' );
1385
1386would correctly match something like this:
1387
1388 $text = '<leftop: conj /and/ conj>';
1389
1390See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1391
1392
1393=head2 C<extract_variable>
1394
1395C<extract_variable> extracts any valid Perl variable or
1396variable-involved expression, including scalars, arrays, hashes, array
1397accesses, hash look-ups, method calls through objects, subroutine calles
1398through subroutine references, etc.
1399
1400The subroutine takes up to two optional arguments:
1401
1402=over 4
1403
1404=item 1.
1405
1406A string to be processed (C<$_> if the string is omitted or C<undef>)
1407
1408=item 2.
1409
1410A string specifying a pattern to be matched as a prefix (which is to be
1411skipped). If omitted, optional whitespace is skipped.
1412
1413=back
1414
1415On success in a list context, an array of 3 elements is returned. The
1416elements are:
1417
1418=over 4
1419
1420=item [0]
1421
1422the extracted variable, or variablish expression
1423
1424=item [1]
1425
1426the remainder of the input text,
1427
1428=item [2]
1429
1430the prefix substring (if any),
1431
1432=back
1433
1434On failure, all of these values (except the remaining text) are C<undef>.
1435
1436In a scalar context, C<extract_variable> returns just the complete
1437substring that matched a variablish expression. C<undef> is returned on
1438failure. In addition, the original input text has the returned substring
1439(and any prefix) removed from it.
1440
1441In a void context, the input text just has the matched substring (and
1442any specified prefix) removed.
1443
1444
1445=head2 C<extract_tagged>
1446
1447C<extract_tagged> extracts and segments text between (balanced)
1448specified tags.
1449
1450The subroutine takes up to five optional arguments:
1451
1452=over 4
1453
1454=item 1.
1455
1456A string to be processed (C<$_> if the string is omitted or C<undef>)
1457
1458=item 2.
1459
1460A string specifying a pattern to be matched as the opening tag.
1461If the pattern string is omitted (or C<undef>) then a pattern
1462that matches any standard XML tag is used.
1463
1464=item 3.
1465
1466A string specifying a pattern to be matched at the closing tag.
1467If the pattern string is omitted (or C<undef>) then the closing
1468tag is constructed by inserting a C</> after any leading bracket
1469characters in the actual opening tag that was matched (I<not> the pattern
1470that matched the tag). For example, if the opening tag pattern
1471is specified as C<'{{\w+}}'> and actually matched the opening tag
1472C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1473
1474=item 4.
1475
1476A string specifying a pattern to be matched as a prefix (which is to be
1477skipped). If omitted, optional whitespace is skipped.
1478
1479=item 5.
1480
1481A hash reference containing various parsing options (see below)
1482
1483=back
1484
1485The various options that can be specified are:
1486
1487=over 4
1488
1489=item C<reject =E<gt> $listref>
1490
1491The list reference contains one or more strings specifying patterns
1492that must I<not> appear within the tagged text.
1493
1494For example, to extract
1495an HTML link (which should not contain nested links) use:
1496
1497 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1498
1499=item C<ignore =E<gt> $listref>
1500
1501The list reference contains one or more strings specifying patterns
1502that are I<not> be be treated as nested tags within the tagged text
1503(even if they would match the start tag pattern).
1504
1505For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1506
1507 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1508
1509(also see L<"gen_delimited_pat"> below).
1510
1511
1512=item C<fail =E<gt> $str>
1513
1514The C<fail> option indicates the action to be taken if a matching end
1515tag is not encountered (i.e. before the end of the string or some
1516C<reject> pattern matches). By default, a failure to match a closing
1517tag causes C<extract_tagged> to immediately fail.
1518
1519However, if the string value associated with <reject> is "MAX", then
1520C<extract_tagged> returns the complete text up to the point of failure.
1521If the string is "PARA", C<extract_tagged> returns only the first paragraph
1522after the tag (up to the first line that is either empty or contains
1523only whitespace characters).
1524If the string is "", the the default behaviour (i.e. failure) is reinstated.
1525
1526For example, suppose the start tag "/para" introduces a paragraph, which then
1527continues until the next "/endpara" tag or until another "/para" tag is
1528encountered:
1529
1530 $text = "/para line 1\n\nline 3\n/para line 4";
1531
1532 extract_tagged($text, '/para', '/endpara', undef,
1533 {reject => '/para', fail => MAX );
1534
1535 # EXTRACTED: "/para line 1\n\nline 3\n"
1536
1537Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1538tag refers only to the immediately following paragraph:
1539
1540 $text = "/para line 1\n\nline 3\n/para line 4";
1541
1542 extract_tagged($text, '/para', '/endpara', undef,
1543 {reject => '/para', fail => MAX );
1544
1545 # EXTRACTED: "/para line 1\n"
1546
1547Note that the specified C<fail> behaviour applies to nested tags as well.
1548
1549=back
1550
1551On success in a list context, an array of 6 elements is returned. The elements are:
1552
1553=over 4
1554
1555=item [0]
1556
1557the extracted tagged substring (including the outermost tags),
1558
1559=item [1]
1560
1561the remainder of the input text,
1562
1563=item [2]
1564
1565the prefix substring (if any),
1566
1567=item [3]
1568
1569the opening tag
1570
1571=item [4]
1572
1573the text between the opening and closing tags
1574
1575=item [5]
1576
1577the closing tag (or "" if no closing tag was found)
1578
1579=back
1580
1581On failure, all of these values (except the remaining text) are C<undef>.
1582
1583In a scalar context, C<extract_tagged> returns just the complete
1584substring that matched a tagged text (including the start and end
1585tags). C<undef> is returned on failure. In addition, the original input
1586text has the returned substring (and any prefix) removed from it.
1587
1588In a void context, the input text just has the matched substring (and
1589any specified prefix) removed.
1590
1591
1592=head2 C<gen_extract_tagged>
1593
1594(Note: This subroutine is only available under Perl5.005)
1595
1596C<gen_extract_tagged> generates a new anonymous subroutine which
1597extracts text between (balanced) specified tags. In other words,
1598it generates a function identical in function to C<extract_tagged>.
1599
1600The difference between C<extract_tagged> and the anonymous
1601subroutines generated by
1602C<gen_extract_tagged>, is that those generated subroutines:
1603
1604=over 4
1605
1606=item *
1607
1608do not have to reparse tag specification or parsing options every time
1609they are called (whereas C<extract_tagged> has to effectively rebuild
1610its tag parser on every call);
1611
1612=item *
1613
1614make use of the new qr// construct to pre-compile the regexes they use
1615(whereas C<extract_tagged> uses standard string variable interpolation
1616to create tag-matching patterns).
1617
1618=back
1619
1620The subroutine takes up to four optional arguments (the same set as
1621C<extract_tagged> except for the string to be processed). It returns
1622a reference to a subroutine which in turn takes a single argument (the text to
1623be extracted from).
1624
1625In other words, the implementation of C<extract_tagged> is exactly
1626equivalent to:
1627
1628 sub extract_tagged
1629 {
1630 my $text = shift;
1631 $extractor = gen_extract_tagged(@_);
1632 return $extractor->($text);
1633 }
1634
1635(although C<extract_tagged> is not currently implemented that way, in order
1636to preserve pre-5.005 compatibility).
1637
1638Using C<gen_extract_tagged> to create extraction functions for specific tags
1639is a good idea if those functions are going to be called more than once, since
1640their performance is typically twice as good as the more general-purpose
1641C<extract_tagged>.
1642
1643
1644=head2 C<extract_quotelike>
1645
1646C<extract_quotelike> attempts to recognize, extract, and segment any
1647one of the various Perl quotes and quotelike operators (see
1648L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1649delimiters (for the quotelike operators), and trailing modifiers are
1650all caught. For example, in:
1651
1652 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1653
1654 extract_quotelike ' "You said, \"Use sed\"." '
1655
1656 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1657
1658 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1659
1660the full Perl quotelike operations are all extracted correctly.
1661
1662Note too that, when using the /x modifier on a regex, any comment
1663containing the current pattern delimiter will cause the regex to be
1664immediately terminated. In other words:
1665
1666 'm /
1667 (?i) # CASE INSENSITIVE
1668 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
1669 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1670 /x'
1671
1672will be extracted as if it were:
1673
1674 'm /
1675 (?i) # CASE INSENSITIVE
1676 [a-z_] # LEADING ALPHABETIC/'
1677
1678This behaviour is identical to that of the actual compiler.
1679
1680C<extract_quotelike> takes two arguments: the text to be processed and
1681a prefix to be matched at the very beginning of the text. If no prefix
1682is specified, optional whitespace is the default. If no text is given,
1683C<$_> is used.
1684
1685In a list context, an array of 11 elements is returned. The elements are:
1686
1687=over 4
1688
1689=item [0]
1690
1691the extracted quotelike substring (including trailing modifiers),
1692
1693=item [1]
1694
1695the remainder of the input text,
1696
1697=item [2]
1698
1699the prefix substring (if any),
1700
1701=item [3]
1702
1703the name of the quotelike operator (if any),
1704
1705=item [4]
1706
1707the left delimiter of the first block of the operation,
1708
1709=item [5]
1710
1711the text of the first block of the operation
1712(that is, the contents of
1713a quote, the regex of a match or substitution or the target list of a
1714translation),
1715
1716=item [6]
1717
1718the right delimiter of the first block of the operation,
1719
1720=item [7]
1721
1722the left delimiter of the second block of the operation
1723(that is, if it is a C<s>, C<tr>, or C<y>),
1724
1725=item [8]
1726
1727the text of the second block of the operation
1728(that is, the replacement of a substitution or the translation list
1729of a translation),
1730
1731=item [9]
1732
1733the right delimiter of the second block of the operation (if any),
1734
1735=item [10]
1736
1737the trailing modifiers on the operation (if any).
1738
1739=back
1740
1741For each of the fields marked "(if any)" the default value on success is
1742an empty string.
1743On failure, all of these values (except the remaining text) are C<undef>.
1744
1745
1746In a scalar context, C<extract_quotelike> returns just the complete substring
1747that matched a quotelike operation (or C<undef> on failure). In a scalar or
1748void context, the input text has the same substring (and any specified
1749prefix) removed.
1750
1751Examples:
1752
1753 # Remove the first quotelike literal that appears in text
1754
1755 $quotelike = extract_quotelike($text,'.*?');
1756
1757 # Replace one or more leading whitespace-separated quotelike
1758 # literals in $_ with "<QLL>"
1759
1760 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1761
1762
1763 # Isolate the search pattern in a quotelike operation from $text
1764
1765 ($op,$pat) = (extract_quotelike $text)[3,5];
1766 if ($op =~ /[ms]/)
1767 {
1768 print "search pattern: $pat\n";
1769 }
1770 else
1771 {
1772 print "$op is not a pattern matching operation\n";
1773 }
1774
1775
1776=head2 C<extract_quotelike> and "here documents"
1777
1778C<extract_quotelike> can successfully extract "here documents" from an input
1779string, but with an important caveat in list contexts.
1780
1781Unlike other types of quote-like literals, a here document is rarely
1782a contiguous substring. For example, a typical piece of code using
1783here document might look like this:
1784
1785 <<'EOMSG' || die;
1786 This is the message.
1787 EOMSG
1788 exit;
1789
1790Given this as an input string in a scalar context, C<extract_quotelike>
1791would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1792leaving the string " || die;\nexit;" in the original variable. In other words,
1793the two separate pieces of the here document are successfully extracted and
1794concatenated.
1795
1796In a list context, C<extract_quotelike> would return the list
1797
1798=over 4
1799
1800=item [0]
1801
1802"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1803including fore and aft delimiters),
1804
1805=item [1]
1806
1807" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1808
1809=item [2]
1810
1811"" (i.e. the prefix substring -- trivial in this case),
1812
1813=item [3]
1814
1815"<<" (i.e. the "name" of the quotelike operator)
1816
1817=item [4]
1818
1819"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1820
1821=item [5]
1822
1823"This is the message.\n" (i.e. the text of the here document),
1824
1825=item [6]
1826
1827"EOMSG" (i.e. the right delimiter of the here document),
1828
1829=item [7..10]
1830
1831"" (a here document has no second left delimiter, second text, second right
1832delimiter, or trailing modifiers).
1833
1834=back
1835
1836However, the matching position of the input variable would be set to
1837"exit;" (i.e. I<after> the closing delimiter of the here document),
1838which would cause the earlier " || die;\nexit;" to be skipped in any
1839sequence of code fragment extractions.
1840
1841To avoid this problem, when it encounters a here document whilst
1842extracting from a modifiable string, C<extract_quotelike> silently
1843rearranges the string to an equivalent piece of Perl:
1844
1845 <<'EOMSG'
1846 This is the message.
1847 EOMSG
1848 || die;
1849 exit;
1850
1851in which the here document I<is> contiguous. It still leaves the
1852matching position after the here document, but now the rest of the line
1853on which the here document starts is not skipped.
1854
1855To prevent <extract_quotelike> from mucking about with the input in this way
1856(this is the only case where a list-context C<extract_quotelike> does so),
1857you can pass the input variable as an interpolated literal:
1858
1859 $quotelike = extract_quotelike("$var");
1860
1861
1862=head2 C<extract_codeblock>
1863
1864C<extract_codeblock> attempts to recognize and extract a balanced
1865bracket delimited substring that may contain unbalanced brackets
1866inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1867is like a combination of C<"extract_bracketed"> and
1868C<"extract_quotelike">.
1869
1870C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1871a text to process, a set of delimiter brackets to look for, and a prefix to
1872match first. It also takes an optional fourth parameter, which allows the
1873outermost delimiter brackets to be specified separately (see below).
1874
1875Omitting the first argument (input text) means process C<$_> instead.
1876Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1877Omitting the third argument (prefix argument) implies optional whitespace at the start.
1878Omitting the fourth argument (outermost delimiter brackets) indicates that the
1879value of the second argument is to be used for the outermost delimiters.
1880
1881Once the prefix an dthe outermost opening delimiter bracket have been
1882recognized, code blocks are extracted by stepping through the input text and
1883trying the following alternatives in sequence:
1884
1885=over 4
1886
1887=item 1.
1888
1889Try and match a closing delimiter bracket. If the bracket was the same
1890species as the last opening bracket, return the substring to that
1891point. If the bracket was mismatched, return an error.
1892
1893=item 2.
1894
1895Try to match a quote or quotelike operator. If found, call
1896C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1897the error it returned. Otherwise go back to step 1.
1898
1899=item 3.
1900
1901Try to match an opening delimiter bracket. If found, call
1902C<extract_codeblock> recursively to eat the embedded block. If the
1903recursive call fails, return an error. Otherwise, go back to step 1.
1904
1905=item 4.
1906
1907Unconditionally match a bareword or any other single character, and
1908then go back to step 1.
1909
1910=back
1911
1912
1913Examples:
1914
1915 # Find a while loop in the text
1916
1917 if ($text =~ s/.*?while\s*\{/{/)
1918 {
1919 $loop = "while " . extract_codeblock($text);
1920 }
1921
1922 # Remove the first round-bracketed list (which may include
1923 # round- or curly-bracketed code blocks or quotelike operators)
1924
1925 extract_codeblock $text, "(){}", '[^(]*';
1926
1927
1928The ability to specify a different outermost delimiter bracket is useful
1929in some circumstances. For example, in the Parse::RecDescent module,
1930parser actions which are to be performed only on a successful parse
1931are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1932
1933 sentence: subject verb object
1934 <defer: {$::theVerb = $item{verb}} >
1935
1936Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1937within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1938
1939A deferred action like this:
1940
1941 <defer: {if ($count>10) {$count--}} >
1942
1943will be incorrectly parsed as:
1944
1945 <defer: {if ($count>
1946
1947because the "less than" operator is interpreted as a closing delimiter.
1948
1949But, by extracting the directive using
1950S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1951the '>' character is only treated as a delimited at the outermost
1952level of the code block, so the directive is parsed correctly.
1953
1954=head2 C<extract_multiple>
1955
1956The C<extract_multiple> subroutine takes a string to be processed and a
1957list of extractors (subroutines or regular expressions) to apply to that string.
1958
1959In an array context C<extract_multiple> returns an array of substrings
1960of the original string, as extracted by the specified extractors.
1961In a scalar context, C<extract_multiple> returns the first
1962substring successfully extracted from the original string. In both
1963scalar and void contexts the original string has the first successfully
1964extracted substring removed from it. In all contexts
1965C<extract_multiple> starts at the current C<pos> of the string, and
1966sets that C<pos> appropriately after it matches.
1967
1968Hence, the aim of of a call to C<extract_multiple> in a list context
1969is to split the processed string into as many non-overlapping fields as
1970possible, by repeatedly applying each of the specified extractors
1971to the remainder of the string. Thus C<extract_multiple> is
1972a generalized form of Perl's C<split> subroutine.
1973
1974The subroutine takes up to four optional arguments:
1975
1976=over 4
1977
1978=item 1.
1979
1980A string to be processed (C<$_> if the string is omitted or C<undef>)
1981
1982=item 2.
1983
1984A reference to a list of subroutine references and/or qr// objects and/or
1985literal strings and/or hash references, specifying the extractors
1986to be used to split the string. If this argument is omitted (or
1987C<undef>) the list:
1988
1989 [
1990 sub { extract_variable($_[0], '') },
1991 sub { extract_quotelike($_[0],'') },
1992 sub { extract_codeblock($_[0],'{}','') },
1993 ]
1994
1995is used.
1996
1997
1998=item 3.
1999
2000An number specifying the maximum number of fields to return. If this
2001argument is omitted (or C<undef>), split continues as long as possible.
2002
2003If the third argument is I<N>, then extraction continues until I<N> fields
2004have been successfully extracted, or until the string has been completely
2005processed.
2006
2007Note that in scalar and void contexts the value of this argument is
2008automatically reset to 1 (under C<-w>, a warning is issued if the argument
2009has to be reset).
2010
2011=item 4.
2012
2013A value indicating whether unmatched substrings (see below) within the
2014text should be skipped or returned as fields. If the value is true,
2015such substrings are skipped. Otherwise, they are returned.
2016
2017=back
2018
2019The extraction process works by applying each extractor in
2020sequence to the text string.
2021
2022If the extractor is a subroutine it is called in a list context and is
2023expected to return a list of a single element, namely the extracted
2024text. It may optionally also return two further arguments: a string
2025representing the text left after extraction (like $' for a pattern
2026match), and a string representing any prefix skipped before the
2027extraction (like $` in a pattern match). Note that this is designed
2028to facilitate the use of other Text::Balanced subroutines with
2029C<extract_multiple>. Note too that the value returned by an extractor
2030subroutine need not bear any relationship to the corresponding substring
2031of the original text (see examples below).
2032
2033If the extractor is a precompiled regular expression or a string,
2034it is matched against the text in a scalar context with a leading
2035'\G' and the gc modifiers enabled. The extracted value is either
2036$1 if that variable is defined after the match, or else the
2037complete match (i.e. $&).
2038
2039If the extractor is a hash reference, it must contain exactly one element.
2040The value of that element is one of the
2041above extractor types (subroutine reference, regular expression, or string).
2042The key of that element is the name of a class into which the successful
2043return value of the extractor will be blessed.
2044
2045If an extractor returns a defined value, that value is immediately
2046treated as the next extracted field and pushed onto the list of fields.
2047If the extractor was specified in a hash reference, the field is also
2048blessed into the appropriate class,
2049
2050If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2051assumed to have failed to extract.
2052If none of the extractor subroutines succeeds, then one
2053character is extracted from the start of the text and the extraction
2054subroutines reapplied. Characters which are thus removed are accumulated and
2055eventually become the next field (unless the fourth argument is true, in which
2056case they are disgarded).
2057
2058For example, the following extracts substrings that are valid Perl variables:
2059
2060 @fields = extract_multiple($text,
2061 [ sub { extract_variable($_[0]) } ],
2062 undef, 1);
2063
2064This example separates a text into fields which are quote delimited,
2065curly bracketed, and anything else. The delimited and bracketed
2066parts are also blessed to identify them (the "anything else" is unblessed):
2067
2068 @fields = extract_multiple($text,
2069 [
2070 { Delim => sub { extract_delimited($_[0],q{'"}) } },
2071 { Brack => sub { extract_bracketed($_[0],'{}') } },
2072 ]);
2073
2074This call extracts the next single substring that is a valid Perl quotelike
2075operator (and removes it from $text):
2076
2077 $quotelike = extract_multiple($text,
2078 [
2079 sub { extract_quotelike($_[0]) },
2080 ], undef, 1);
2081
2082Finally, here is yet another way to do comma-separated value parsing:
2083
2084 @fields = extract_multiple($csv_text,
2085 [
2086 sub { extract_delimited($_[0],q{'"}) },
2087 qr/([^,]+)(.*)/,
2088 ],
2089 undef,1);
2090
2091The list in the second argument means:
2092I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2093The undef third argument means:
2094I<"...as many times as possible...">,
2095and the true value in the fourth argument means
2096I<"...discarding anything else that appears (i.e. the commas)">.
2097
2098If you wanted the commas preserved as separate fields (i.e. like split
2099does if your split pattern has capturing parentheses), you would
2100just make the last parameter undefined (or remove it).
2101
2102
2103=head2 C<gen_delimited_pat>
2104
2105The C<gen_delimited_pat> subroutine takes a single (string) argument and
2106 > builds a Friedl-style optimized regex that matches a string delimited
2107by any one of the characters in the single argument. For example:
2108
2109 gen_delimited_pat(q{'"})
2110
2111returns the regex:
2112
2113 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2114
2115Note that the specified delimiters are automatically quotemeta'd.
2116
2117A typical use of C<gen_delimited_pat> would be to build special purpose tags
2118for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2119(which might contain quoted strings):
2120
2121 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2122
2123 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2124
2125
2126C<gen_delimited_pat> may also be called with an optional second argument,
2127which specifies the "escape" character(s) to be used for each delimiter.
2128For example to match a Pascal-style string (where ' is the delimiter
2129and '' is a literal ' within the string):
2130
2131 gen_delimited_pat(q{'},q{'});
2132
2133Different escape characters can be specified for different delimiters.
2134For example, to specify that '/' is the escape for single quotes
2135and '%' is the escape for double quotes:
2136
2137 gen_delimited_pat(q{'"},q{/%});
2138
2139If more delimiters than escape chars are specified, the last escape char
2140is used for the remaining delimiters.
2141If no escape char is specified for a given specified delimiter, '\' is used.
2142
2143Note that
2144C<gen_delimited_pat> was previously called
2145C<delimited_pat>. That name may still be used, but is now deprecated.
2146
2147
2148=head1 DIAGNOSTICS
2149
2150In a list context, all the functions return C<(undef,$original_text)>
2151on failure. In a scalar context, failure is indicated by returning C<undef>
2152(in this case the input text is not modified in any way).
2153
2154In addition, on failure in I<any> context, the C<$@> variable is set.
2155Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2156below.
2157Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2158which the error was detected (although not necessarily where it occurred!)
2159Printing C<$@> directly produces the error message, with the offset appended.
2160On success, the C<$@> variable is guaranteed to be C<undef>.
2161
2162The available diagnostics are:
2163
2164=over 4
2165
2166=item C<Did not find a suitable bracket: "%s">
2167
2168The delimiter provided to C<extract_bracketed> was not one of
2169C<'()[]E<lt>E<gt>{}'>.
2170
2171=item C<Did not find prefix: /%s/>
2172
2173A non-optional prefix was specified but wasn't found at the start of the text.
2174
2175=item C<Did not find opening bracket after prefix: "%s">
2176
2177C<extract_bracketed> or C<extract_codeblock> was expecting a
2178particular kind of bracket at the start of the text, and didn't find it.
2179
2180=item C<No quotelike operator found after prefix: "%s">
2181
2182C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2183C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2184it was extracting.
2185
2186=item C<Unmatched closing bracket: "%c">
2187
2188C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2189a closing bracket where none was expected.
2190
2191=item C<Unmatched opening bracket(s): "%s">
2192
2193C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2194out of characters in the text before closing one or more levels of nested
2195brackets.
2196
2197=item C<Unmatched embedded quote (%s)>
2198
2199C<extract_bracketed> attempted to match an embedded quoted substring, but
2200failed to find a closing quote to match it.
2201
2202=item C<Did not find closing delimiter to match '%s'>
2203
2204C<extract_quotelike> was unable to find a closing delimiter to match the
2205one that opened the quote-like operation.
2206
2207=item C<Mismatched closing bracket: expected "%c" but found "%s">
2208
2209C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2210a valid bracket delimiter, but it was the wrong species. This usually
2211indicates a nesting error, but may indicate incorrect quoting or escaping.
2212
2213=item C<No block delimiter found after quotelike "%s">
2214
2215C<extract_quotelike> or C<extract_codeblock> found one of the
2216quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2217without a suitable block after it.
2218
2219=item C<Did not find leading dereferencer>
2220
2221C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2222a variable, but didn't find any of them.
2223
2224=item C<Bad identifier after dereferencer>
2225
2226C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2227character was not followed by a legal Perl identifier.
2228
2229=item C<Did not find expected opening bracket at %s>
2230
2231C<extract_codeblock> failed to find any of the outermost opening brackets
2232that were specified.
2233
2234=item C<Improperly nested codeblock at %s>
2235
2236A nested code block was found that started with a delimiter that was specified
2237as being only to be used as an outermost bracket.
2238
2239=item C<Missing second block for quotelike "%s">
2240
2241C<extract_codeblock> or C<extract_quotelike> found one of the
2242quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2243
2244=item C<No match found for opening bracket>
2245
2246C<extract_codeblock> failed to find a closing bracket to match the outermost
2247opening bracket.
2248
2249=item C<Did not find opening tag: /%s/>
2250
2251C<extract_tagged> did not find a suitable opening tag (after any specified
2252prefix was removed).
2253
2254=item C<Unable to construct closing tag to match: /%s/>
2255
2256C<extract_tagged> matched the specified opening tag and tried to
2257modify the matched text to produce a matching closing tag (because
2258none was specified). It failed to generate the closing tag, almost
2259certainly because the opening tag did not start with a
2260bracket of some kind.
2261
2262=item C<Found invalid nested tag: %s>
2263
2264C<extract_tagged> found a nested tag that appeared in the "reject" list
2265(and the failure mode was not "MAX" or "PARA").
2266
2267=item C<Found unbalanced nested tag: %s>
2268
2269C<extract_tagged> found a nested opening tag that was not matched by a
2270corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2271
2272=item C<Did not find closing tag>
2273
2274C<extract_tagged> reached the end of the text without finding a closing tag
2275to match the original opening tag (and the failure mode was not
2276"MAX" or "PARA").
2277
2278
2279
2280
2281=back
2282
2283
2284=head1 AUTHOR
2285
2286Damian Conway ([email protected])
2287
2288
2289=head1 BUGS AND IRRITATIONS
2290
2291There are undoubtedly serious bugs lurking somewhere in this code, if
2292only because parts of it give the impression of understanding a great deal
2293more about Perl than they really do.
2294
2295Bug reports and other feedback are most welcome.
2296
2297
2298=head1 COPYRIGHT
2299
2300 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2301 This module is free software. It may be used, redistributed
2302 and/or modified under the same terms as Perl itself.
Note: See TracBrowser for help on using the repository browser.