Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: for-distributions/trunk/bin/windows/perl/lib/Text/Balanced.pm@ 14489

Last change on this file since 14489 was 14489, checked in by oranfry, 17 years ago
upgrading to perl 5.8
File size: 65.8 KB

Line
1	# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2	# FOR FULL DOCUMENTATION SEE Balanced.pod
3
4	use 5.005;
5	use strict;
6
7	package Text::Balanced;
8
9	use Exporter;
10	use SelfLoader;
11	use vars qw { $VERSION @ISA %EXPORT_TAGS };
12
13	$VERSION = '1.95';
14	@ISA = qw ( Exporter );
15
16	%EXPORT_TAGS = ( ALL => [ qw(
17	&extract_delimited
18	&extract_bracketed
19	&extract_quotelike
20	&extract_codeblock
21	&extract_variable
22	&extract_tagged
23	&extract_multiple
24
25	&gen_delimited_pat
26	&gen_extract_tagged
27
28	&delimited_pat
29	) ] );
30
31	Exporter::export_ok_tags('ALL');
32
33	# PROTOTYPES
34
35	sub _match_bracketed($$$$$$);
36	sub _match_variable($$);
37	sub _match_codeblock($$$$$$$);
38	sub _match_quotelike($$$$);
39
40	# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41
42	sub _failmsg {
43	my ($message, $pos) = @_;
44	$@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
45	}
46
47	sub _fail
48	{
49	my ($wantarray, $textref, $message, $pos) = @_;
50	_failmsg $message, $pos if $message;
51	return ("",$$textref,"") if $wantarray;
52	return undef;
53	}
54
55	sub _succeed
56	{
57	$@ = undef;
58	my ($wantarray,$textref) = splice @_, 0, 2;
59	my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
60	my ($startlen) = $_[5];
61	my $remainderpos = $_[2];
62	if ($wantarray)
63	{
64	my @res;
65	while (my ($from, $len) = splice @_, 0, 2)
66	{
67	push @res, substr($$textref,$from,$len);
68	}
69	if ($extralen) { # CORRECT FILLET
70	my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
71	$res[1] = "$extra$res[1]";
72	eval { substr($$textref,$remainderpos,0) = $extra;
73	substr($$textref,$extrapos,$extralen,"\n")} ;
74	#REARRANGE HERE DOC AND FILLET IF POSSIBLE
75	pos($$textref) = $remainderpos-$extralen+1; # RESET \G
76	}
77	else {
78	pos($$textref) = $remainderpos; # RESET \G
79	}
80	return @res;
81	}
82	else
83	{
84	my $match = substr($$textref,$_[0],$_[1]);
85	substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
86	my $extra = $extralen
87	? substr($$textref, $extrapos, $extralen)."\n" : "";
88	eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
89	pos($$textref) = $_[4]; # RESET \G
90	return $match;
91	}
92	}
93
94	# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95
96	sub gen_delimited_pat($;$) # ($delimiters;$escapes)
97	{
98	my ($dels, $escs) = @_;
99	return "" unless $dels =~ /\S/;
100	$escs = '\\' unless $escs;
101	$escs .= substr($escs,-1) x (length($dels)-length($escs));
102	my @pat = ();
103	my $i;
104	for ($i=0; $i<length $dels; $i++)
105	{
106	my $del = quotemeta substr($dels,$i,1);
107	my $esc = quotemeta substr($escs,$i,1);
108	if ($del eq $esc)
109	{
110	push @pat, "$del(?:[^$del](?:(?:$del$del)[^$del])*)$del";
111	}
112	else
113	{
114	push @pat, "$del(?:[^$esc$del](?:$esc.[^$esc$del])*)$del";
115	}
116	}
117	my $pat = join '\|', @pat;
118	return "(?:$pat)";
119	}
120
121	*delimited_pat = \&gen_delimited_pat;
122
123
124	# THE EXTRACTION FUNCTIONS
125
126	sub extract_delimited (;$$$$)
127	{
128	my $textref = defined $_[0] ? \$_[0] : \$_;
129	my $wantarray = wantarray;
130	my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
131	my $pre = defined $_[2] ? $_[2] : '\s*';
132	my $esc = defined $_[3] ? $_[3] : qq{\\};
133	my $pat = gen_delimited_pat($del, $esc);
134	my $startpos = pos $$textref \|\| 0;
135	return _fail($wantarray, $textref, "Not a delimited pattern", 0)
136	unless $$textref =~ m/\G($pre)($pat)/gc;
137	my $prelen = length($1);
138	my $matchpos = $startpos+$prelen;
139	my $endpos = pos $$textref;
140	return _succeed $wantarray, $textref,
141	$matchpos, $endpos-$matchpos, # MATCH
142	$endpos, length($$textref)-$endpos, # REMAINDER
143	$startpos, $prelen; # PREFIX
144	}
145
146	sub extract_bracketed (;$$$)
147	{
148	my $textref = defined $_[0] ? \$_[0] : \$_;
149	my $ldel = defined $_[1] ? $_[1] : '{([<';
150	my $pre = defined $_[2] ? $_[2] : '\s*';
151	my $wantarray = wantarray;
152	my $qdel = "";
153	my $quotelike;
154	$ldel =~ s/'//g and $qdel .= q{'};
155	$ldel =~ s/"//g and $qdel .= q{"};
156	$ldel =~ s/`//g and $qdel .= q{`};
157	$ldel =~ s/q//g and $quotelike = 1;
158	$ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
159	my $rdel = $ldel;
160	unless ($rdel =~ tr/[({</])}>/)
161	{
162	return _fail $wantarray, $textref,
163	"Did not find a suitable bracket in delimiter: \"$_[1]\"",
164	0;
165	}
166	my $posbug = pos;
167	$ldel = join('\|', map { quotemeta $_ } split('', $ldel));
168	$rdel = join('\|', map { quotemeta $_ } split('', $rdel));
169	pos = $posbug;
170
171	my $startpos = pos $$textref \|\| 0;
172	my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
173
174	return _fail ($wantarray, $textref) unless @match;
175
176	return _succeed ( $wantarray, $textref,
177	$match[2], $match[5]+2, # MATCH
178	@match[8,9], # REMAINDER
179	@match[0,1], # PREFIX
180	);
181	}
182
183	sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
184	{
185	my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
186	my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref\|\|0);
187	unless ($$textref =~ m/\G$pre/gc)
188	{
189	_failmsg "Did not find prefix: /$pre/", $startpos;
190	return;
191	}
192
193	$ldelpos = pos $$textref;
194
195	unless ($$textref =~ m/\G($ldel)/gc)
196	{
197	_failmsg "Did not find opening bracket after prefix: \"$pre\"",
198	pos $$textref;
199	pos $$textref = $startpos;
200	return;
201	}
202
203	my @nesting = ( $1 );
204	my $textlen = length $$textref;
205	while (pos $$textref < $textlen)
206	{
207	next if $$textref =~ m/\G\\./gcs;
208
209	if ($$textref =~ m/\G($ldel)/gc)
210	{
211	push @nesting, $1;
212	}
213	elsif ($$textref =~ m/\G($rdel)/gc)
214	{
215	my ($found, $brackettype) = ($1, $1);
216	if ($#nesting < 0)
217	{
218	_failmsg "Unmatched closing bracket: \"$found\"",
219	pos $$textref;
220	pos $$textref = $startpos;
221	return;
222	}
223	my $expected = pop(@nesting);
224	$expected =~ tr/({[</)}]>/;
225	if ($expected ne $brackettype)
226	{
227	_failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
228	pos $$textref;
229	pos $$textref = $startpos;
230	return;
231	}
232	last if $#nesting < 0;
233	}
234	elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
235	{
236	$$textref =~ m/\G[^\\$1](?:\\.[^\\$1])*(\Q$1\E)/gsc and next;
237	_failmsg "Unmatched embedded quote ($1)",
238	pos $$textref;
239	pos $$textref = $startpos;
240	return;
241	}
242	elsif ($quotelike && _match_quotelike($textref,"",1,0))
243	{
244	next;
245	}
246
247	else { $$textref =~ m/\G(?:[a-zA-Z0-9]+\|.)/gcs }
248	}
249	if ($#nesting>=0)
250	{
251	_failmsg "Unmatched opening bracket(s): "
252	. join("..",@nesting)."..",
253	pos $$textref;
254	pos $$textref = $startpos;
255	return;
256	}
257
258	$endpos = pos $$textref;
259
260	return (
261	$startpos, $ldelpos-$startpos, # PREFIX
262	$ldelpos, 1, # OPENING BRACKET
263	$ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
264	$endpos-1, 1, # CLOSING BRACKET
265	$endpos, length($$textref)-$endpos, # REMAINDER
266	);
267	}
268
269	sub revbracket($)
270	{
271	my $brack = reverse $_[0];
272	$brack =~ tr/[({</])}>/;
273	return $brack;
274	}
275
276	my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277
278	sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
279	{
280	my $textref = defined $_[0] ? \$_[0] : \$_;
281	my $ldel = $_[1];
282	my $rdel = $_[2];
283	my $pre = defined $_[3] ? $_[3] : '\s*';
284	my %options = defined $_[4] ? %{$_[4]} : ();
285	my $omode = defined $options{fail} ? $options{fail} : '';
286	my $bad = ref($options{reject}) eq 'ARRAY' ? join('\|', @{$options{reject}})
287	: defined($options{reject}) ? $options{reject}
288	: ''
289	;
290	my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('\|', @{$options{ignore}})
291	: defined($options{ignore}) ? $options{ignore}
292	: ''
293	;
294
295	if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '\|[^>])*>'; }
296	$@ = undef;
297
298	my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
299
300	return _fail(wantarray, $textref) unless @match;
301	return _succeed wantarray, $textref,
302	$match[2], $match[3]+$match[5]+$match[7], # MATCH
303	@match[8..9,0..1,2..7]; # REM, PRE, BITS
304	}
305
306	sub _match_tagged # ($$$$$$$)
307	{
308	my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
309	my $rdelspec;
310
311	my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)\|\|0 );
312
313	unless ($$textref =~ m/\G($pre)/gc)
314	{
315	_failmsg "Did not find prefix: /$pre/", pos $$textref;
316	goto failed;
317	}
318
319	$opentagpos = pos($$textref);
320
321	unless ($$textref =~ m/\G$ldel/gc)
322	{
323	_failmsg "Did not find opening tag: /$ldel/", pos $$textref;
324	goto failed;
325	}
326
327	$textpos = pos($$textref);
328
329	if (!defined $rdel)
330	{
331	$rdelspec = $&;
332	unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
333	{
334	_failmsg "Unable to construct closing tag to match: $rdel",
335	pos $$textref;
336	goto failed;
337	}
338	}
339	else
340	{
341	$rdelspec = eval "qq{$rdel}" \|\| do {
342	my $del;
343	for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / \| ',)
344	{ next if $rdel =~ /\Q$_/; $del = $_; last }
345	unless ($del) {
346	use Carp;
347	croak "Can't interpolate right delimiter $rdel"
348	}
349	eval "qq$del$rdel$del";
350	};
351	}
352
353	while (pos($$textref) < length($$textref))
354	{
355	next if $$textref =~ m/\G\\./gc;
356
357	if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
358	{
359	$parapos = pos($$textref) - length($1)
360	unless defined $parapos;
361	}
362	elsif ($$textref =~ m/\G($rdelspec)/gc )
363	{
364	$closetagpos = pos($$textref)-length($1);
365	goto matched;
366	}
367	elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
368	{
369	next;
370	}
371	elsif ($bad && $$textref =~ m/\G($bad)/gcs)
372	{
373	pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
374	goto short if ($omode eq 'PARA' \|\| $omode eq 'MAX');
375	_failmsg "Found invalid nested tag: $1", pos $$textref;
376	goto failed;
377	}
378	elsif ($$textref =~ m/\G($ldel)/gc)
379	{
380	my $tag = $1;
381	pos($$textref) -= length($tag); # REWIND TO NESTED TAG
382	unless (_match_tagged(@_)) # MATCH NESTED TAG
383	{
384	goto short if $omode eq 'PARA' \|\| $omode eq 'MAX';
385	_failmsg "Found unbalanced nested tag: $tag",
386	pos $$textref;
387	goto failed;
388	}
389	}
390	else { $$textref =~ m/./gcs }
391	}
392
393	short:
394	$closetagpos = pos($$textref);
395	goto matched if $omode eq 'MAX';
396	goto failed unless $omode eq 'PARA';
397
398	if (defined $parapos) { pos($$textref) = $parapos }
399	else { $parapos = pos($$textref) }
400
401	return (
402	$startpos, $opentagpos-$startpos, # PREFIX
403	$opentagpos, $textpos-$opentagpos, # OPENING TAG
404	$textpos, $parapos-$textpos, # TEXT
405	$parapos, 0, # NO CLOSING TAG
406	$parapos, length($$textref)-$parapos, # REMAINDER
407	);
408
409	matched:
410	$endpos = pos($$textref);
411	return (
412	$startpos, $opentagpos-$startpos, # PREFIX
413	$opentagpos, $textpos-$opentagpos, # OPENING TAG
414	$textpos, $closetagpos-$textpos, # TEXT
415	$closetagpos, $endpos-$closetagpos, # CLOSING TAG
416	$endpos, length($$textref)-$endpos, # REMAINDER
417	);
418
419	failed:
420	_failmsg "Did not find closing tag", pos $$textref unless $@;
421	pos($$textref) = $startpos;
422	return;
423	}
424
425	sub extract_variable (;$$)
426	{
427	my $textref = defined $_[0] ? \$_[0] : \$_;
428	return ("","","") unless defined $$textref;
429	my $pre = defined $_[1] ? $_[1] : '\s*';
430
431	my @match = _match_variable($textref,$pre);
432
433	return _fail wantarray, $textref unless @match;
434
435	return _succeed wantarray, $textref,
436	@match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
437	}
438
439	sub _match_variable($$)
440	{
441	# $#
442	# $^
443	# $$
444	my ($textref, $pre) = @_;
445	my $startpos = pos($$textref) = pos($$textref)\|\|0;
446	unless ($$textref =~ m/\G($pre)/gc)
447	{
448	_failmsg "Did not find prefix: /$pre/", pos $$textref;
449	return;
450	}
451	my $varpos = pos($$textref);
452	unless ($$textref =~ m{\G\$\s(?!::)(\d+\|[][&`'+./\|,";%=~:?!\@<>()-]\|\^[a-z]?)}gci)
453	{
454	unless ($$textref =~ m/\G((\$#?\|[*\@\%]\|\\&)+)/gc)
455	{
456	_failmsg "Did not find leading dereferencer", pos $$textref;
457	pos $$textref = $startpos;
458	return;
459	}
460	my $deref = $1;
461
462	unless ($$textref =~ m/\G\s(?:::\|')?(?:[_a-z]\w(?:::\|'))[_a-z]\w/gci
463	or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
464	or $deref eq '$#' or $deref eq '$$' )
465	{
466	_failmsg "Bad identifier after dereferencer", pos $$textref;
467	pos $$textref = $startpos;
468	return;
469	}
470	}
471
472	while (1)
473	{
474	next if $$textref =~ m/\G\s(?:->)?\s[{]\w+[}]/gc;
475	next if _match_codeblock($textref,
476	qr/\s->\s(?:[_a-zA-Z]\w+\s*)?/,
477	qr/[({[]/, qr/[)}\]]/,
478	qr/[({[]/, qr/[)}\]]/, 0);
479	next if _match_codeblock($textref,
480	qr/\s*/, qr/[{[]/, qr/[}\]]/,
481	qr/[{[]/, qr/[}\]]/, 0);
482	next if _match_variable($textref,'\s->\s');
483	next if $$textref =~ m/\G\s->\s\w+(?![{([])/gc;
484	last;
485	}
486
487	my $endpos = pos($$textref);
488	return ($startpos, $varpos-$startpos,
489	$varpos, $endpos-$varpos,
490	$endpos, length($$textref)-$endpos
491	);
492	}
493
494	sub extract_codeblock (;$$$$$)
495	{
496	my $textref = defined $_[0] ? \$_[0] : \$_;
497	my $wantarray = wantarray;
498	my $ldel_inner = defined $_[1] ? $_[1] : '{';
499	my $pre = defined $_[2] ? $_[2] : '\s*';
500	my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
501	my $rd = $_[4];
502	my $rdel_inner = $ldel_inner;
503	my $rdel_outer = $ldel_outer;
504	my $posbug = pos;
505	for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
506	for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
507	for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
508	{
509	$_ = '('.join('\|',map { quotemeta $_ } split('',$_)).')'
510	}
511	pos = $posbug;
512
513	my @match = _match_codeblock($textref, $pre,
514	$ldel_outer, $rdel_outer,
515	$ldel_inner, $rdel_inner,
516	$rd);
517	return _fail($wantarray, $textref) unless @match;
518	return _succeed($wantarray, $textref,
519	@match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
520	);
521
522	}
523
524	sub _match_codeblock($$$$$$$)
525	{
526	my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
527	my $startpos = pos($$textref) = pos($$textref) \|\| 0;
528	unless ($$textref =~ m/\G($pre)/gc)
529	{
530	_failmsg qq{Did not match prefix /$pre/ at"} .
531	substr($$textref,pos($$textref),20) .
532	q{..."},
533	pos $$textref;
534	return;
535	}
536	my $codepos = pos($$textref);
537	unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
538	{
539	_failmsg qq{Did not find expected opening bracket at "} .
540	substr($$textref,pos($$textref),20) .
541	q{..."},
542	pos $$textref;
543	pos $$textref = $startpos;
544	return;
545	}
546	my $closing = $1;
547	$closing =~ tr/([<{/)]>}/;
548	my $matched;
549	my $patvalid = 1;
550	while (pos($$textref) < length($$textref))
551	{
552	$matched = '';
553	if ($rd && $$textref =~ m#\G(\Q(?)\E\|\Q(s?)\E\|\Q(s)\E)#gc)
554	{
555	$patvalid = 0;
556	next;
557	}
558
559	if ($$textref =~ m/\G\s#./gc)
560	{
561	next;
562	}
563
564	if ($$textref =~ m/\G\s*($rdel_outer)/gc)
565	{
566	unless ($matched = ($closing && $1 eq $closing) )
567	{
568	next if $1 eq '>'; # MIGHT BE A "LESS THAN"
569	_failmsg q{Mismatched closing bracket at "} .
570	substr($$textref,pos($$textref),20) .
571	qq{...". Expected '$closing'},
572	pos $$textref;
573	}
574	last;
575	}
576
577	if (_match_variable($textref,'\s*') \|\|
578	_match_quotelike($textref,'\s*',$patvalid,$patvalid) )
579	{
580	$patvalid = 0;
581	next;
582	}
583
584
585	# NEED TO COVER MANY MORE CASES HERE!!!
586	if ($$textref =~ m#\G\s*(?!$ldel_inner)
587	( [-+*x/%^&\|.]=?
588	\| [!=]~
589	\| =(?!>)
590	\| (\\\|&&\|\\|\\|\|<<\|>>)=?
591	\| split\|grep\|map\|return
592	\| [([]
593	)#gcx)
594	{
595	$patvalid = 1;
596	next;
597	}
598
599	if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
600	{
601	$patvalid = 1;
602	next;
603	}
604
605	if ($$textref =~ m/\G\s*$ldel_outer/gc)
606	{
607	_failmsg q{Improperly nested codeblock at "} .
608	substr($$textref,pos($$textref),20) .
609	q{..."},
610	pos $$textref;
611	last;
612	}
613
614	$patvalid = 0;
615	$$textref =~ m/\G\s*(\w+\|[-=>]>\|.\|\Z)/gc;
616	}
617	continue { $@ = undef }
618
619	unless ($matched)
620	{
621	_failmsg 'No match found for opening bracket', pos $$textref
622	unless $@;
623	return;
624	}
625
626	my $endpos = pos($$textref);
627	return ( $startpos, $codepos-$startpos,
628	$codepos, $endpos-$codepos,
629	$endpos, length($$textref)-$endpos,
630	);
631	}
632
633
634	my %mods = (
635	'none' => '[cgimsox]*',
636	'm' => '[cgimsox]*',
637	's' => '[cegimsox]*',
638	'tr' => '[cds]*',
639	'y' => '[cds]*',
640	'qq' => '',
641	'qx' => '',
642	'qw' => '',
643	'qr' => '[imsx]*',
644	'q' => '',
645	);
646
647	sub extract_quotelike (;$$)
648	{
649	my $textref = $_[0] ? \$_[0] : \$_;
650	my $wantarray = wantarray;
651	my $pre = defined $_[1] ? $_[1] : '\s*';
652
653	my @match = _match_quotelike($textref,$pre,1,0);
654	return _fail($wantarray, $textref) unless @match;
655	return _succeed($wantarray, $textref,
656	$match[2], $match[18]-$match[2], # MATCH
657	@match[18,19], # REMAINDER
658	@match[0,1], # PREFIX
659	@match[2..17], # THE BITS
660	@match[20,21], # ANY FILLET?
661	);
662	};
663
664	sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
665	{
666	my ($textref, $pre, $rawmatch, $qmark) = @_;
667
668	my ($textlen,$startpos,
669	$oppos,
670	$preld1pos,$ld1pos,$str1pos,$rd1pos,
671	$preld2pos,$ld2pos,$str2pos,$rd2pos,
672	$modpos) = ( length($$textref), pos($$textref) = pos($$textref) \|\| 0 );
673
674	unless ($$textref =~ m/\G($pre)/gc)
675	{
676	_failmsg qq{Did not find prefix /$pre/ at "} .
677	substr($$textref, pos($$textref), 20) .
678	q{..."},
679	pos $$textref;
680	return;
681	}
682	$oppos = pos($$textref);
683
684	my $initial = substr($$textref,$oppos,1);
685
686	if ($initial && $initial =~ m\|^[\"\'\`]\|
687	\|\| $rawmatch && $initial =~ m\|^/\|
688	\|\| $qmark && $initial =~ m\|^\?\|)
689	{
690	unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]) \Q$initial\E /gcsx)
691	{
692	_failmsg qq{Did not find closing delimiter to match '$initial' at "} .
693	substr($$textref, $oppos, 20) .
694	q{..."},
695	pos $$textref;
696	pos $$textref = $startpos;
697	return;
698	}
699	$modpos= pos($$textref);
700	$rd1pos = $modpos-1;
701
702	if ($initial eq '/' \|\| $initial eq '?')
703	{
704	$$textref =~ m/\G$mods{none}/gc
705	}
706
707	my $endpos = pos($$textref);
708	return (
709	$startpos, $oppos-$startpos, # PREFIX
710	$oppos, 0, # NO OPERATOR
711	$oppos, 1, # LEFT DEL
712	$oppos+1, $rd1pos-$oppos-1, # STR/PAT
713	$rd1pos, 1, # RIGHT DEL
714	$modpos, 0, # NO 2ND LDEL
715	$modpos, 0, # NO 2ND STR
716	$modpos, 0, # NO 2ND RDEL
717	$modpos, $endpos-$modpos, # MODIFIERS
718	$endpos, $textlen-$endpos, # REMAINDER
719	);
720	}
721
722	unless ($$textref =~ m{\G(\b(?:m\|s\|qq\|qx\|qw\|q\|qr\|tr\|y)\b(?=\s*\S)\|<<)}gc)
723	{
724	_failmsg q{No quotelike operator found after prefix at "} .
725	substr($$textref, pos($$textref), 20) .
726	q{..."},
727	pos $$textref;
728	pos $$textref = $startpos;
729	return;
730	}
731
732	my $op = $1;
733	$preld1pos = pos($$textref);
734	if ($op eq '<<') {
735	$ld1pos = pos($$textref);
736	my $label;
737	if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
738	$label = $1;
739	}
740	elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\])) '
741	\| \G " ([^"\\]* (?:\\.[^"\\])) "
742	\| \G ` ([^`\\]* (?:\\.[^`\\])) `
743	}gcsx) {
744	$label = $+;
745	}
746	else {
747	$label = "";
748	}
749	my $extrapos = pos($$textref);
750	$$textref =~ m{.*\n}gc;
751	$str1pos = pos($$textref);
752	unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
753	_failmsg qq{Missing here doc terminator ('$label') after "} .
754	substr($$textref, $startpos, 20) .
755	q{..."},
756	pos $$textref;
757	pos $$textref = $startpos;
758	return;
759	}
760	$rd1pos = pos($$textref);
761	$$textref =~ m{$label\n}gc;
762	$ld2pos = pos($$textref);
763	return (
764	$startpos, $oppos-$startpos, # PREFIX
765	$oppos, length($op), # OPERATOR
766	$ld1pos, $extrapos-$ld1pos, # LEFT DEL
767	$str1pos, $rd1pos-$str1pos, # STR/PAT
768	$rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
769	$ld2pos, 0, # NO 2ND LDEL
770	$ld2pos, 0, # NO 2ND STR
771	$ld2pos, 0, # NO 2ND RDEL
772	$ld2pos, 0, # NO MODIFIERS
773	$ld2pos, $textlen-$ld2pos, # REMAINDER
774	$extrapos, $str1pos-$extrapos, # FILLETED BIT
775	);
776	}
777
778	$$textref =~ m/\G\s*/gc;
779	$ld1pos = pos($$textref);
780	$str1pos = $ld1pos+1;
781
782	unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
783	{
784	_failmsg "No block delimiter found after quotelike $op",
785	pos $$textref;
786	pos $$textref = $startpos;
787	return;
788	}
789	pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
790	my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
791	if ($ldel1 =~ /[[(<{]/)
792	{
793	$rdel1 =~ tr/[({</])}>/;
794	_match_bracketed($textref,"",$ldel1,"","",$rdel1)
795	\|\| do { pos $$textref = $startpos; return };
796	}
797	else
798	{
799	$$textref =~ /$ldel1[^\\$ldel1](\\.[^\\$ldel1])*$ldel1/gcs
800	\|\| do { pos $$textref = $startpos; return };
801	}
802	$ld2pos = $rd1pos = pos($$textref)-1;
803
804	my $second_arg = $op =~ /s\|tr\|y/ ? 1 : 0;
805	if ($second_arg)
806	{
807	my ($ldel2, $rdel2);
808	if ($ldel1 =~ /[[(<{]/)
809	{
810	unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
811	{
812	_failmsg "Missing second block for quotelike $op",
813	pos $$textref;
814	pos $$textref = $startpos;
815	return;
816	}
817	$ldel2 = $rdel2 = "\Q$1";
818	$rdel2 =~ tr/[({</])}>/;
819	}
820	else
821	{
822	$ldel2 = $rdel2 = $ldel1;
823	}
824	$str2pos = $ld2pos+1;
825
826	if ($ldel2 =~ /[[(<{]/)
827	{
828	pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
829	_match_bracketed($textref,"",$ldel2,"","",$rdel2)
830	\|\| do { pos $$textref = $startpos; return };
831	}
832	else
833	{
834	$$textref =~ /[^\\$ldel2](\\.[^\\$ldel2])*$ldel2/gcs
835	\|\| do { pos $$textref = $startpos; return };
836	}
837	$rd2pos = pos($$textref)-1;
838	}
839	else
840	{
841	$ld2pos = $str2pos = $rd2pos = $rd1pos;
842	}
843
844	$modpos = pos $$textref;
845
846	$$textref =~ m/\G($mods{$op})/gc;
847	my $endpos = pos $$textref;
848
849	return (
850	$startpos, $oppos-$startpos, # PREFIX
851	$oppos, length($op), # OPERATOR
852	$ld1pos, 1, # LEFT DEL
853	$str1pos, $rd1pos-$str1pos, # STR/PAT
854	$rd1pos, 1, # RIGHT DEL
855	$ld2pos, $second_arg, # 2ND LDEL (MAYBE)
856	$str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
857	$rd2pos, $second_arg, # 2ND RDEL (MAYBE)
858	$modpos, $endpos-$modpos, # MODIFIERS
859	$endpos, $textlen-$endpos, # REMAINDER
860	);
861	}
862
863	my $def_func =
864	[
865	sub { extract_variable($_[0], '') },
866	sub { extract_quotelike($_[0],'') },
867	sub { extract_codeblock($_[0],'{}','') },
868	];
869
870	sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
871	{
872	my $textref = defined($_[0]) ? \$_[0] : \$_;
873	my $posbug = pos;
874	my ($lastpos, $firstpos);
875	my @fields = ();
876
877	#for ($$textref)
878	{
879	my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
880	my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
881	my $igunk = $_[3];
882
883	pos $$textref \|\|= 0;
884
885	unless (wantarray)
886	{
887	use Carp;
888	carp "extract_multiple reset maximal count to 1 in scalar context"
889	if $^W && defined($_[2]) && $max > 1;
890	$max = 1
891	}
892
893	my $unkpos;
894	my $func;
895	my $class;
896
897	my @class;
898	foreach $func ( @func )
899	{
900	if (ref($func) eq 'HASH')
901	{
902	push @class, (keys %$func)[0];
903	$func = (values %$func)[0];
904	}
905	else
906	{
907	push @class, undef;
908	}
909	}
910
911	FIELD: while (pos($$textref) < length($$textref))
912	{
913	my ($field, $rem);
914	my @bits;
915	foreach my $i ( 0..$#func )
916	{
917	my $pref;
918	$func = $func[$i];
919	$class = $class[$i];
920	$lastpos = pos $$textref;
921	if (ref($func) eq 'CODE')
922	{ ($field,$rem,$pref) = @bits = $func->($$textref);
923	# print "[$field\|$rem]" if $field;
924	}
925	elsif (ref($func) eq 'Text::Balanced::Extractor')
926	{ @bits = $field = $func->extract($$textref) }
927	elsif( $$textref =~ m/\G$func/gc )
928	{ @bits = $field = defined($1) ? $1 : $& }
929	$pref \|\|= "";
930	if (defined($field) && length($field))
931	{
932	if (!$igunk) {
933	$unkpos = pos $$textref
934	if length($pref) && !defined($unkpos);
935	if (defined $unkpos)
936	{
937	push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
938	$firstpos = $unkpos unless defined $firstpos;
939	undef $unkpos;
940	last FIELD if @fields == $max;
941	}
942	}
943	push @fields, $class
944	? bless (\$field, $class)
945	: $field;
946	$firstpos = $lastpos unless defined $firstpos;
947	$lastpos = pos $$textref;
948	last FIELD if @fields == $max;
949	next FIELD;
950	}
951	}
952	if ($$textref =~ /\G(.)/gcs)
953	{
954	$unkpos = pos($$textref)-1
955	unless $igunk \|\| defined $unkpos;
956	}
957	}
958
959	if (defined $unkpos)
960	{
961	push @fields, substr($$textref, $unkpos);
962	$firstpos = $unkpos unless defined $firstpos;
963	$lastpos = length $$textref;
964	}
965	last;
966	}
967
968	pos $$textref = $lastpos;
969	return @fields if wantarray;
970
971	$firstpos \|\|= 0;
972	eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
973	pos $$textref = $firstpos };
974	return $fields[0];
975	}
976
977
978	sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
979	{
980	my $ldel = $_[0];
981	my $rdel = $_[1];
982	my $pre = defined $_[2] ? $_[2] : '\s*';
983	my %options = defined $_[3] ? %{$_[3]} : ();
984	my $omode = defined $options{fail} ? $options{fail} : '';
985	my $bad = ref($options{reject}) eq 'ARRAY' ? join('\|', @{$options{reject}})
986	: defined($options{reject}) ? $options{reject}
987	: ''
988	;
989	my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('\|', @{$options{ignore}})
990	: defined($options{ignore}) ? $options{ignore}
991	: ''
992	;
993
994	if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '\|[^>])*>'; }
995
996	my $posbug = pos;
997	for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
998	pos = $posbug;
999
1000	my $closure = sub
1001	{
1002	my $textref = defined $_[0] ? \$_[0] : \$_;
1003	my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1004
1005	return _fail(wantarray, $textref) unless @match;
1006	return _succeed wantarray, $textref,
1007	$match[2], $match[3]+$match[5]+$match[7], # MATCH
1008	@match[8..9,0..1,2..7]; # REM, PRE, BITS
1009	};
1010
1011	bless $closure, 'Text::Balanced::Extractor';
1012	}
1013
1014	package Text::Balanced::Extractor;
1015
1016	sub extract($$) # ($self, $text)
1017	{
1018	&{$_[0]}($_[1]);
1019	}
1020
1021	package Text::Balanced::ErrorMsg;
1022
1023	use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1024
1025	1;
1026
1027	__END__
1028
1029	=head1 NAME
1030
1031	Text::Balanced - Extract delimited text sequences from strings.
1032
1033
1034	=head1 SYNOPSIS
1035
1036	use Text::Balanced qw (
1037	extract_delimited
1038	extract_bracketed
1039	extract_quotelike
1040	extract_codeblock
1041	extract_variable
1042	extract_tagged
1043	extract_multiple
1044
1045	gen_delimited_pat
1046	gen_extract_tagged
1047	);
1048
1049	# Extract the initial substring of $text that is delimited by
1050	# two (unescaped) instances of the first character in $delim.
1051
1052	($extracted, $remainder) = extract_delimited($text,$delim);
1053
1054
1055	# Extract the initial substring of $text that is bracketed
1056	# with a delimiter(s) specified by $delim (where the string
1057	# in $delim contains one or more of '(){}[]<>').
1058
1059	($extracted, $remainder) = extract_bracketed($text,$delim);
1060
1061
1062	# Extract the initial substring of $text that is bounded by
1063	# an XML tag.
1064
1065	($extracted, $remainder) = extract_tagged($text);
1066
1067
1068	# Extract the initial substring of $text that is bounded by
1069	# a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1070
1071	($extracted, $remainder) =
1072	extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1073
1074
1075	# Extract the initial substring of $text that represents a
1076	# Perl "quote or quote-like operation"
1077
1078	($extracted, $remainder) = extract_quotelike($text);
1079
1080
1081	# Extract the initial substring of $text that represents a block
1082	# of Perl code, bracketed by any of character(s) specified by $delim
1083	# (where the string $delim contains one or more of '(){}[]<>').
1084
1085	($extracted, $remainder) = extract_codeblock($text,$delim);
1086
1087
1088	# Extract the initial substrings of $text that would be extracted by
1089	# one or more sequential applications of the specified functions
1090	# or regular expressions
1091
1092	@extracted = extract_multiple($text,
1093	[ \&extract_bracketed,
1094	\&extract_quotelike,
1095	\&some_other_extractor_sub,
1096	qr/[xyz]*/,
1097	'literal',
1098	]);
1099
1100	# Create a string representing an optimized pattern (a la Friedl)
1101	# that matches a substring delimited by any of the specified characters
1102	# (in this case: any type of quote or a slash)
1103
1104	$patstring = gen_delimited_pat(q{'"`/});
1105
1106
1107	# Generate a reference to an anonymous sub that is just like extract_tagged
1108	# but pre-compiled and optimized for a specific pair of tags, and consequently
1109	# much faster (i.e. 3 times faster). It uses qr// for better performance on
1110	# repeated calls, so it only works under Perl 5.005 or later.
1111
1112	$extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1113
1114	($extracted, $remainder) = $extract_head->($text);
1115
1116
1117	=head1 DESCRIPTION
1118
1119	The various C<extract_...> subroutines may be used to
1120	extract a delimited substring, possibly after skipping a
1121	specified prefix string. By default, that prefix is
1122	optional whitespace (C</\s*/>), but you can change it to whatever
1123	you wish (see below).
1124
1125	The substring to be extracted must appear at the
1126	current C<pos> location of the string's variable
1127	(or at index zero, if no C<pos> position is defined).
1128	In other words, the C<extract_...> subroutines I<don't>
1129	extract the first occurance of a substring anywhere
1130	in a string (like an unanchored regex would). Rather,
1131	they extract an occurance of the substring appearing
1132	immediately at the current matching position in the
1133	string (like a C<\G>-anchored regex would).
1134
1135
1136
1137	=head2 General behaviour in list contexts
1138
1139	In a list context, all the subroutines return a list, the first three
1140	elements of which are always:
1141
1142	=over 4
1143
1144	=item [0]
1145
1146	The extracted string, including the specified delimiters.
1147	If the extraction fails an empty string is returned.
1148
1149	=item [1]
1150
1151	The remainder of the input string (i.e. the characters after the
1152	extracted string). On failure, the entire string is returned.
1153
1154	=item [2]
1155
1156	The skipped prefix (i.e. the characters before the extracted string).
1157	On failure, the empty string is returned.
1158
1159	=back
1160
1161	Note that in a list context, the contents of the original input text (the first
1162	argument) are not modified in any way.
1163
1164	However, if the input text was passed in a variable, that variable's
1165	C<pos> value is updated to point at the first character after the
1166	extracted text. That means that in a list context the various
1167	subroutines can be used much like regular expressions. For example:
1168
1169	while ( $next = (extract_quotelike($text))[0] )
1170	{
1171	# process next quote-like (in $next)
1172	}
1173
1174
1175	=head2 General behaviour in scalar and void contexts
1176
1177	In a scalar context, the extracted string is returned, having first been
1178	removed from the input text. Thus, the following code also processes
1179	each quote-like operation, but actually removes them from $text:
1180
1181	while ( $next = extract_quotelike($text) )
1182	{
1183	# process next quote-like (in $next)
1184	}
1185
1186	Note that if the input text is a read-only string (i.e. a literal),
1187	no attempt is made to remove the extracted text.
1188
1189	In a void context the behaviour of the extraction subroutines is
1190	exactly the same as in a scalar context, except (of course) that the
1191	extracted substring is not returned.
1192
1193	=head2 A note about prefixes
1194
1195	Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1196	This can bite you if you're expecting a prefix specification like
1197	'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1198	pattern will only succeed if the <H1> tag is on the current line, since
1199	. normally doesn't match newlines.
1200
1201	To overcome this limitation, you need to turn on /s matching within
1202	the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1203
1204
1205	=head2 C<extract_delimited>
1206
1207	The C<extract_delimited> function formalizes the common idiom
1208	of extracting a single-character-delimited substring from the start of
1209	a string. For example, to extract a single-quote delimited string, the
1210	following code is typically used:
1211
1212	($remainder = $text) =~ s/\A('(\\.\|[^'])*')//s;
1213	$extracted = $1;
1214
1215	but with C<extract_delimited> it can be simplified to:
1216
1217	($extracted,$remainder) = extract_delimited($text, "'");
1218
1219	C<extract_delimited> takes up to four scalars (the input text, the
1220	delimiters, a prefix pattern to be skipped, and any escape characters)
1221	and extracts the initial substring of the text that
1222	is appropriately delimited. If the delimiter string has multiple
1223	characters, the first one encountered in the text is taken to delimit
1224	the substring.
1225	The third argument specifies a prefix pattern that is to be skipped
1226	(but must be present!) before the substring is extracted.
1227	The final argument specifies the escape character to be used for each
1228	delimiter.
1229
1230	All arguments are optional. If the escape characters are not specified,
1231	every delimiter is escaped with a backslash (C<\>).
1232	If the prefix is not specified, the
1233	pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1234	is also not specified, the set C</["'`]/> is used. If the text to be processed
1235	is not specified either, C<$_> is used.
1236
1237	In list context, C<extract_delimited> returns a array of three
1238	elements, the extracted substring (I<including the surrounding
1239	delimiters>), the remainder of the text, and the skipped prefix (if
1240	any). If a suitable delimited substring is not found, the first
1241	element of the array is the empty string, the second is the complete
1242	original text, and the prefix returned in the third element is an
1243	empty string.
1244
1245	In a scalar context, just the extracted substring is returned. In
1246	a void context, the extracted substring (and any prefix) are simply
1247	removed from the beginning of the first argument.
1248
1249	Examples:
1250
1251	# Remove a single-quoted substring from the very beginning of $text:
1252
1253	$substring = extract_delimited($text, "'", '');
1254
1255	# Remove a single-quoted Pascalish substring (i.e. one in which
1256	# doubling the quote character escapes it) from the very
1257	# beginning of $text:
1258
1259	$substring = extract_delimited($text, "'", '', "'");
1260
1261	# Extract a single- or double- quoted substring from the
1262	# beginning of $text, optionally after some whitespace
1263	# (note the list context to protect $text from modification):
1264
1265	($substring) = extract_delimited $text, q{"'};
1266
1267
1268	# Delete the substring delimited by the first '/' in $text:
1269
1270	$text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1271
1272	Note that this last example is I<not> the same as deleting the first
1273	quote-like pattern. For instance, if C<$text> contained the string:
1274
1275	"if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1276
1277	then after the deletion it would contain:
1278
1279	"if ('.$UNIXCMD/s) { $cmd = $1; }"
1280
1281	not:
1282
1283	"if ('./cmd' =~ ms) { $cmd = $1; }"
1284
1285
1286	See L<"extract_quotelike"> for a (partial) solution to this problem.
1287
1288
1289	=head2 C<extract_bracketed>
1290
1291	Like C<"extract_delimited">, the C<extract_bracketed> function takes
1292	up to three optional scalar arguments: a string to extract from, a delimiter
1293	specifier, and a prefix pattern. As before, a missing prefix defaults to
1294	optional whitespace and a missing text defaults to C<$_>. However, a missing
1295	delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1296
1297	C<extract_bracketed> extracts a balanced-bracket-delimited
1298	substring (using any one (or more) of the user-specified delimiter
1299	brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1300	respect quoted unbalanced brackets (see below).
1301
1302	A "delimiter bracket" is a bracket in list of delimiters passed as
1303	C<extract_bracketed>'s second argument. Delimiter brackets are
1304	specified by giving either the left or right (or both!) versions
1305	of the required bracket(s). Note that the order in which
1306	two or more delimiter brackets are specified is not significant.
1307
1308	A "balanced-bracket-delimited substring" is a substring bounded by
1309	matched brackets, such that any other (left or right) delimiter
1310	bracket I<within> the substring is also matched by an opposite
1311	(right or left) delimiter bracket I<at the same level of nesting>. Any
1312	type of bracket not in the delimiter list is treated as an ordinary
1313	character.
1314
1315	In other words, each type of bracket specified as a delimiter must be
1316	balanced and correctly nested within the substring, and any other kind of
1317	("non-delimiter") bracket in the substring is ignored.
1318
1319	For example, given the string:
1320
1321	$text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1322
1323	then a call to C<extract_bracketed> in a list context:
1324
1325	@result = extract_bracketed( $text, '{}' );
1326
1327	would return:
1328
1329	( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1330
1331	since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1332	(In a scalar context just the first element of the array would be returned. In
1333	a void context, C<$text> would be replaced by an empty string.)
1334
1335	Likewise the call in:
1336
1337	@result = extract_bracketed( $text, '{[' );
1338
1339	would return the same result, since all sets of both types of specified
1340	delimiter brackets are correctly nested and balanced.
1341
1342	However, the call in:
1343
1344	@result = extract_bracketed( $text, '{([<' );
1345
1346	would fail, returning:
1347
1348	( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
1349
1350	because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1351	the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1352	return an empty string. In a void context, C<$text> would be unchanged.)
1353
1354	Note that the embedded single-quotes in the string don't help in this
1355	case, since they have not been specified as acceptable delimiters and are
1356	therefore treated as non-delimiter characters (and ignored).
1357
1358	However, if a particular species of quote character is included in the
1359	delimiter specification, then that type of quote will be correctly handled.
1360	for example, if C<$text> is:
1361
1362	$text = '<A HREF=">>>>">link</A>';
1363
1364	then
1365
1366	@result = extract_bracketed( $text, '<">' );
1367
1368	returns:
1369
1370	( '<A HREF=">>>>">', 'link</A>', "" )
1371
1372	as expected. Without the specification of C<"> as an embedded quoter:
1373
1374	@result = extract_bracketed( $text, '<>' );
1375
1376	the result would be:
1377
1378	( '<A HREF=">', '>>>">link</A>', "" )
1379
1380	In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1381	quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1382	letter 'q' as a delimiter. Hence:
1383
1384	@result = extract_bracketed( $text, '<q>' );
1385
1386	would correctly match something like this:
1387
1388	$text = '<leftop: conj /and/ conj>';
1389
1390	See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1391
1392
1393	=head2 C<extract_variable>
1394
1395	C<extract_variable> extracts any valid Perl variable or
1396	variable-involved expression, including scalars, arrays, hashes, array
1397	accesses, hash look-ups, method calls through objects, subroutine calles
1398	through subroutine references, etc.
1399
1400	The subroutine takes up to two optional arguments:
1401
1402	=over 4
1403
1404	=item 1.
1405
1406	A string to be processed (C<$_> if the string is omitted or C<undef>)
1407
1408	=item 2.
1409
1410	A string specifying a pattern to be matched as a prefix (which is to be
1411	skipped). If omitted, optional whitespace is skipped.
1412
1413	=back
1414
1415	On success in a list context, an array of 3 elements is returned. The
1416	elements are:
1417
1418	=over 4
1419
1420	=item [0]
1421
1422	the extracted variable, or variablish expression
1423
1424	=item [1]
1425
1426	the remainder of the input text,
1427
1428	=item [2]
1429
1430	the prefix substring (if any),
1431
1432	=back
1433
1434	On failure, all of these values (except the remaining text) are C<undef>.
1435
1436	In a scalar context, C<extract_variable> returns just the complete
1437	substring that matched a variablish expression. C<undef> is returned on
1438	failure. In addition, the original input text has the returned substring
1439	(and any prefix) removed from it.
1440
1441	In a void context, the input text just has the matched substring (and
1442	any specified prefix) removed.
1443
1444
1445	=head2 C<extract_tagged>
1446
1447	C<extract_tagged> extracts and segments text between (balanced)
1448	specified tags.
1449
1450	The subroutine takes up to five optional arguments:
1451
1452	=over 4
1453
1454	=item 1.
1455
1456	A string to be processed (C<$_> if the string is omitted or C<undef>)
1457
1458	=item 2.
1459
1460	A string specifying a pattern to be matched as the opening tag.
1461	If the pattern string is omitted (or C<undef>) then a pattern
1462	that matches any standard XML tag is used.
1463
1464	=item 3.
1465
1466	A string specifying a pattern to be matched at the closing tag.
1467	If the pattern string is omitted (or C<undef>) then the closing
1468	tag is constructed by inserting a C</> after any leading bracket
1469	characters in the actual opening tag that was matched (I<not> the pattern
1470	that matched the tag). For example, if the opening tag pattern
1471	is specified as C<'{{\w+}}'> and actually matched the opening tag
1472	C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1473
1474	=item 4.
1475
1476	A string specifying a pattern to be matched as a prefix (which is to be
1477	skipped). If omitted, optional whitespace is skipped.
1478
1479	=item 5.
1480
1481	A hash reference containing various parsing options (see below)
1482
1483	=back
1484
1485	The various options that can be specified are:
1486
1487	=over 4
1488
1489	=item C<reject =E<gt> $listref>
1490
1491	The list reference contains one or more strings specifying patterns
1492	that must I<not> appear within the tagged text.
1493
1494	For example, to extract
1495	an HTML link (which should not contain nested links) use:
1496
1497	extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1498
1499	=item C<ignore =E<gt> $listref>
1500
1501	The list reference contains one or more strings specifying patterns
1502	that are I<not> be be treated as nested tags within the tagged text
1503	(even if they would match the start tag pattern).
1504
1505	For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1506
1507	extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1508
1509	(also see L<"gen_delimited_pat"> below).
1510
1511
1512	=item C<fail =E<gt> $str>
1513
1514	The C<fail> option indicates the action to be taken if a matching end
1515	tag is not encountered (i.e. before the end of the string or some
1516	C<reject> pattern matches). By default, a failure to match a closing
1517	tag causes C<extract_tagged> to immediately fail.
1518
1519	However, if the string value associated with <reject> is "MAX", then
1520	C<extract_tagged> returns the complete text up to the point of failure.
1521	If the string is "PARA", C<extract_tagged> returns only the first paragraph
1522	after the tag (up to the first line that is either empty or contains
1523	only whitespace characters).
1524	If the string is "", the the default behaviour (i.e. failure) is reinstated.
1525
1526	For example, suppose the start tag "/para" introduces a paragraph, which then
1527	continues until the next "/endpara" tag or until another "/para" tag is
1528	encountered:
1529
1530	$text = "/para line 1\n\nline 3\n/para line 4";
1531
1532	extract_tagged($text, '/para', '/endpara', undef,
1533	{reject => '/para', fail => MAX );
1534
1535	# EXTRACTED: "/para line 1\n\nline 3\n"
1536
1537	Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1538	tag refers only to the immediately following paragraph:
1539
1540	$text = "/para line 1\n\nline 3\n/para line 4";
1541
1542	extract_tagged($text, '/para', '/endpara', undef,
1543	{reject => '/para', fail => MAX );
1544
1545	# EXTRACTED: "/para line 1\n"
1546
1547	Note that the specified C<fail> behaviour applies to nested tags as well.
1548
1549	=back
1550
1551	On success in a list context, an array of 6 elements is returned. The elements are:
1552
1553	=over 4
1554
1555	=item [0]
1556
1557	the extracted tagged substring (including the outermost tags),
1558
1559	=item [1]
1560
1561	the remainder of the input text,
1562
1563	=item [2]
1564
1565	the prefix substring (if any),
1566
1567	=item [3]
1568
1569	the opening tag
1570
1571	=item [4]
1572
1573	the text between the opening and closing tags
1574
1575	=item [5]
1576
1577	the closing tag (or "" if no closing tag was found)
1578
1579	=back
1580
1581	On failure, all of these values (except the remaining text) are C<undef>.
1582
1583	In a scalar context, C<extract_tagged> returns just the complete
1584	substring that matched a tagged text (including the start and end
1585	tags). C<undef> is returned on failure. In addition, the original input
1586	text has the returned substring (and any prefix) removed from it.
1587
1588	In a void context, the input text just has the matched substring (and
1589	any specified prefix) removed.
1590
1591
1592	=head2 C<gen_extract_tagged>
1593
1594	(Note: This subroutine is only available under Perl5.005)
1595
1596	C<gen_extract_tagged> generates a new anonymous subroutine which
1597	extracts text between (balanced) specified tags. In other words,
1598	it generates a function identical in function to C<extract_tagged>.
1599
1600	The difference between C<extract_tagged> and the anonymous
1601	subroutines generated by
1602	C<gen_extract_tagged>, is that those generated subroutines:
1603
1604	=over 4
1605
1606	=item *
1607
1608	do not have to reparse tag specification or parsing options every time
1609	they are called (whereas C<extract_tagged> has to effectively rebuild
1610	its tag parser on every call);
1611
1612	=item *
1613
1614	make use of the new qr// construct to pre-compile the regexes they use
1615	(whereas C<extract_tagged> uses standard string variable interpolation
1616	to create tag-matching patterns).
1617
1618	=back
1619
1620	The subroutine takes up to four optional arguments (the same set as
1621	C<extract_tagged> except for the string to be processed). It returns
1622	a reference to a subroutine which in turn takes a single argument (the text to
1623	be extracted from).
1624
1625	In other words, the implementation of C<extract_tagged> is exactly
1626	equivalent to:
1627
1628	sub extract_tagged
1629	{
1630	my $text = shift;
1631	$extractor = gen_extract_tagged(@_);
1632	return $extractor->($text);
1633	}
1634
1635	(although C<extract_tagged> is not currently implemented that way, in order
1636	to preserve pre-5.005 compatibility).
1637
1638	Using C<gen_extract_tagged> to create extraction functions for specific tags
1639	is a good idea if those functions are going to be called more than once, since
1640	their performance is typically twice as good as the more general-purpose
1641	C<extract_tagged>.
1642
1643
1644	=head2 C<extract_quotelike>
1645
1646	C<extract_quotelike> attempts to recognize, extract, and segment any
1647	one of the various Perl quotes and quotelike operators (see
1648	L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1649	delimiters (for the quotelike operators), and trailing modifiers are
1650	all caught. For example, in:
1651
1652	extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1653
1654	extract_quotelike ' "You said, \"Use sed\"." '
1655
1656	extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1657
1658	extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1659
1660	the full Perl quotelike operations are all extracted correctly.
1661
1662	Note too that, when using the /x modifier on a regex, any comment
1663	containing the current pattern delimiter will cause the regex to be
1664	immediately terminated. In other words:
1665
1666	'm /
1667	(?i) # CASE INSENSITIVE
1668	[a-z_] # LEADING ALPHABETIC/UNDERSCORE
1669	[a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1670	/x'
1671
1672	will be extracted as if it were:
1673
1674	'm /
1675	(?i) # CASE INSENSITIVE
1676	[a-z_] # LEADING ALPHABETIC/'
1677
1678	This behaviour is identical to that of the actual compiler.
1679
1680	C<extract_quotelike> takes two arguments: the text to be processed and
1681	a prefix to be matched at the very beginning of the text. If no prefix
1682	is specified, optional whitespace is the default. If no text is given,
1683	C<$_> is used.
1684
1685	In a list context, an array of 11 elements is returned. The elements are:
1686
1687	=over 4
1688
1689	=item [0]
1690
1691	the extracted quotelike substring (including trailing modifiers),
1692
1693	=item [1]
1694
1695	the remainder of the input text,
1696
1697	=item [2]
1698
1699	the prefix substring (if any),
1700
1701	=item [3]
1702
1703	the name of the quotelike operator (if any),
1704
1705	=item [4]
1706
1707	the left delimiter of the first block of the operation,
1708
1709	=item [5]
1710
1711	the text of the first block of the operation
1712	(that is, the contents of
1713	a quote, the regex of a match or substitution or the target list of a
1714	translation),
1715
1716	=item [6]
1717
1718	the right delimiter of the first block of the operation,
1719
1720	=item [7]
1721
1722	the left delimiter of the second block of the operation
1723	(that is, if it is a C<s>, C<tr>, or C<y>),
1724
1725	=item [8]
1726
1727	the text of the second block of the operation
1728	(that is, the replacement of a substitution or the translation list
1729	of a translation),
1730
1731	=item [9]
1732
1733	the right delimiter of the second block of the operation (if any),
1734
1735	=item [10]
1736
1737	the trailing modifiers on the operation (if any).
1738
1739	=back
1740
1741	For each of the fields marked "(if any)" the default value on success is
1742	an empty string.
1743	On failure, all of these values (except the remaining text) are C<undef>.
1744
1745
1746	In a scalar context, C<extract_quotelike> returns just the complete substring
1747	that matched a quotelike operation (or C<undef> on failure). In a scalar or
1748	void context, the input text has the same substring (and any specified
1749	prefix) removed.
1750
1751	Examples:
1752
1753	# Remove the first quotelike literal that appears in text
1754
1755	$quotelike = extract_quotelike($text,'.*?');
1756
1757	# Replace one or more leading whitespace-separated quotelike
1758	# literals in $_ with "<QLL>"
1759
1760	do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1761
1762
1763	# Isolate the search pattern in a quotelike operation from $text
1764
1765	($op,$pat) = (extract_quotelike $text)[3,5];
1766	if ($op =~ /[ms]/)
1767	{
1768	print "search pattern: $pat\n";
1769	}
1770	else
1771	{
1772	print "$op is not a pattern matching operation\n";
1773	}
1774
1775
1776	=head2 C<extract_quotelike> and "here documents"
1777
1778	C<extract_quotelike> can successfully extract "here documents" from an input
1779	string, but with an important caveat in list contexts.
1780
1781	Unlike other types of quote-like literals, a here document is rarely
1782	a contiguous substring. For example, a typical piece of code using
1783	here document might look like this:
1784
1785	<<'EOMSG' \|\| die;
1786	This is the message.
1787	EOMSG
1788	exit;
1789
1790	Given this as an input string in a scalar context, C<extract_quotelike>
1791	would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1792	leaving the string " \|\| die;\nexit;" in the original variable. In other words,
1793	the two separate pieces of the here document are successfully extracted and
1794	concatenated.
1795
1796	In a list context, C<extract_quotelike> would return the list
1797
1798	=over 4
1799
1800	=item [0]
1801
1802	"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1803	including fore and aft delimiters),
1804
1805	=item [1]
1806
1807	" \|\| die;\nexit;" (i.e. the remainder of the input text, concatenated),
1808
1809	=item [2]
1810
1811	"" (i.e. the prefix substring -- trivial in this case),
1812
1813	=item [3]
1814
1815	"<<" (i.e. the "name" of the quotelike operator)
1816
1817	=item [4]
1818
1819	"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1820
1821	=item [5]
1822
1823	"This is the message.\n" (i.e. the text of the here document),
1824
1825	=item [6]
1826
1827	"EOMSG" (i.e. the right delimiter of the here document),
1828
1829	=item [7..10]
1830
1831	"" (a here document has no second left delimiter, second text, second right
1832	delimiter, or trailing modifiers).
1833
1834	=back
1835
1836	However, the matching position of the input variable would be set to
1837	"exit;" (i.e. I<after> the closing delimiter of the here document),
1838	which would cause the earlier " \|\| die;\nexit;" to be skipped in any
1839	sequence of code fragment extractions.
1840
1841	To avoid this problem, when it encounters a here document whilst
1842	extracting from a modifiable string, C<extract_quotelike> silently
1843	rearranges the string to an equivalent piece of Perl:
1844
1845	<<'EOMSG'
1846	This is the message.
1847	EOMSG
1848	\|\| die;
1849	exit;
1850
1851	in which the here document I<is> contiguous. It still leaves the
1852	matching position after the here document, but now the rest of the line
1853	on which the here document starts is not skipped.
1854
1855	To prevent <extract_quotelike> from mucking about with the input in this way
1856	(this is the only case where a list-context C<extract_quotelike> does so),
1857	you can pass the input variable as an interpolated literal:
1858
1859	$quotelike = extract_quotelike("$var");
1860
1861
1862	=head2 C<extract_codeblock>
1863
1864	C<extract_codeblock> attempts to recognize and extract a balanced
1865	bracket delimited substring that may contain unbalanced brackets
1866	inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1867	is like a combination of C<"extract_bracketed"> and
1868	C<"extract_quotelike">.
1869
1870	C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1871	a text to process, a set of delimiter brackets to look for, and a prefix to
1872	match first. It also takes an optional fourth parameter, which allows the
1873	outermost delimiter brackets to be specified separately (see below).
1874
1875	Omitting the first argument (input text) means process C<$_> instead.
1876	Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1877	Omitting the third argument (prefix argument) implies optional whitespace at the start.
1878	Omitting the fourth argument (outermost delimiter brackets) indicates that the
1879	value of the second argument is to be used for the outermost delimiters.
1880
1881	Once the prefix an dthe outermost opening delimiter bracket have been
1882	recognized, code blocks are extracted by stepping through the input text and
1883	trying the following alternatives in sequence:
1884
1885	=over 4
1886
1887	=item 1.
1888
1889	Try and match a closing delimiter bracket. If the bracket was the same
1890	species as the last opening bracket, return the substring to that
1891	point. If the bracket was mismatched, return an error.
1892
1893	=item 2.
1894
1895	Try to match a quote or quotelike operator. If found, call
1896	C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1897	the error it returned. Otherwise go back to step 1.
1898
1899	=item 3.
1900
1901	Try to match an opening delimiter bracket. If found, call
1902	C<extract_codeblock> recursively to eat the embedded block. If the
1903	recursive call fails, return an error. Otherwise, go back to step 1.
1904
1905	=item 4.
1906
1907	Unconditionally match a bareword or any other single character, and
1908	then go back to step 1.
1909
1910	=back
1911
1912
1913	Examples:
1914
1915	# Find a while loop in the text
1916
1917	if ($text =~ s/.?while\s\{/{/)
1918	{
1919	$loop = "while " . extract_codeblock($text);
1920	}
1921
1922	# Remove the first round-bracketed list (which may include
1923	# round- or curly-bracketed code blocks or quotelike operators)
1924
1925	extract_codeblock $text, "(){}", '[^(]*';
1926
1927
1928	The ability to specify a different outermost delimiter bracket is useful
1929	in some circumstances. For example, in the Parse::RecDescent module,
1930	parser actions which are to be performed only on a successful parse
1931	are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1932
1933	sentence: subject verb object
1934	<defer: {$::theVerb = $item{verb}} >
1935
1936	Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1937	within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1938
1939	A deferred action like this:
1940
1941	<defer: {if ($count>10) {$count--}} >
1942
1943	will be incorrectly parsed as:
1944
1945	<defer: {if ($count>
1946
1947	because the "less than" operator is interpreted as a closing delimiter.
1948
1949	But, by extracting the directive using
1950	S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1951	the '>' character is only treated as a delimited at the outermost
1952	level of the code block, so the directive is parsed correctly.
1953
1954	=head2 C<extract_multiple>
1955
1956	The C<extract_multiple> subroutine takes a string to be processed and a
1957	list of extractors (subroutines or regular expressions) to apply to that string.
1958
1959	In an array context C<extract_multiple> returns an array of substrings
1960	of the original string, as extracted by the specified extractors.
1961	In a scalar context, C<extract_multiple> returns the first
1962	substring successfully extracted from the original string. In both
1963	scalar and void contexts the original string has the first successfully
1964	extracted substring removed from it. In all contexts
1965	C<extract_multiple> starts at the current C<pos> of the string, and
1966	sets that C<pos> appropriately after it matches.
1967
1968	Hence, the aim of of a call to C<extract_multiple> in a list context
1969	is to split the processed string into as many non-overlapping fields as
1970	possible, by repeatedly applying each of the specified extractors
1971	to the remainder of the string. Thus C<extract_multiple> is
1972	a generalized form of Perl's C<split> subroutine.
1973
1974	The subroutine takes up to four optional arguments:
1975
1976	=over 4
1977
1978	=item 1.
1979
1980	A string to be processed (C<$_> if the string is omitted or C<undef>)
1981
1982	=item 2.
1983
1984	A reference to a list of subroutine references and/or qr// objects and/or
1985	literal strings and/or hash references, specifying the extractors
1986	to be used to split the string. If this argument is omitted (or
1987	C<undef>) the list:
1988
1989	[
1990	sub { extract_variable($_[0], '') },
1991	sub { extract_quotelike($_[0],'') },
1992	sub { extract_codeblock($_[0],'{}','') },
1993	]
1994
1995	is used.
1996
1997
1998	=item 3.
1999
2000	An number specifying the maximum number of fields to return. If this
2001	argument is omitted (or C<undef>), split continues as long as possible.
2002
2003	If the third argument is I<N>, then extraction continues until I<N> fields
2004	have been successfully extracted, or until the string has been completely
2005	processed.
2006
2007	Note that in scalar and void contexts the value of this argument is
2008	automatically reset to 1 (under C<-w>, a warning is issued if the argument
2009	has to be reset).
2010
2011	=item 4.
2012
2013	A value indicating whether unmatched substrings (see below) within the
2014	text should be skipped or returned as fields. If the value is true,
2015	such substrings are skipped. Otherwise, they are returned.
2016
2017	=back
2018
2019	The extraction process works by applying each extractor in
2020	sequence to the text string.
2021
2022	If the extractor is a subroutine it is called in a list context and is
2023	expected to return a list of a single element, namely the extracted
2024	text. It may optionally also return two further arguments: a string
2025	representing the text left after extraction (like $' for a pattern
2026	match), and a string representing any prefix skipped before the
2027	extraction (like $` in a pattern match). Note that this is designed
2028	to facilitate the use of other Text::Balanced subroutines with
2029	C<extract_multiple>. Note too that the value returned by an extractor
2030	subroutine need not bear any relationship to the corresponding substring
2031	of the original text (see examples below).
2032
2033	If the extractor is a precompiled regular expression or a string,
2034	it is matched against the text in a scalar context with a leading
2035	'\G' and the gc modifiers enabled. The extracted value is either
2036	$1 if that variable is defined after the match, or else the
2037	complete match (i.e. $&).
2038
2039	If the extractor is a hash reference, it must contain exactly one element.
2040	The value of that element is one of the
2041	above extractor types (subroutine reference, regular expression, or string).
2042	The key of that element is the name of a class into which the successful
2043	return value of the extractor will be blessed.
2044
2045	If an extractor returns a defined value, that value is immediately
2046	treated as the next extracted field and pushed onto the list of fields.
2047	If the extractor was specified in a hash reference, the field is also
2048	blessed into the appropriate class,
2049
2050	If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2051	assumed to have failed to extract.
2052	If none of the extractor subroutines succeeds, then one
2053	character is extracted from the start of the text and the extraction
2054	subroutines reapplied. Characters which are thus removed are accumulated and
2055	eventually become the next field (unless the fourth argument is true, in which
2056	case they are disgarded).
2057
2058	For example, the following extracts substrings that are valid Perl variables:
2059
2060	@fields = extract_multiple($text,
2061	[ sub { extract_variable($_[0]) } ],
2062	undef, 1);
2063
2064	This example separates a text into fields which are quote delimited,
2065	curly bracketed, and anything else. The delimited and bracketed
2066	parts are also blessed to identify them (the "anything else" is unblessed):
2067
2068	@fields = extract_multiple($text,
2069	[
2070	{ Delim => sub { extract_delimited($_[0],q{'"}) } },
2071	{ Brack => sub { extract_bracketed($_[0],'{}') } },
2072	]);
2073
2074	This call extracts the next single substring that is a valid Perl quotelike
2075	operator (and removes it from $text):
2076
2077	$quotelike = extract_multiple($text,
2078	[
2079	sub { extract_quotelike($_[0]) },
2080	], undef, 1);
2081
2082	Finally, here is yet another way to do comma-separated value parsing:
2083
2084	@fields = extract_multiple($csv_text,
2085	[
2086	sub { extract_delimited($_[0],q{'"}) },
2087	qr/([^,]+)(.*)/,
2088	],
2089	undef,1);
2090
2091	The list in the second argument means:
2092	I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2093	The undef third argument means:
2094	I<"...as many times as possible...">,
2095	and the true value in the fourth argument means
2096	I<"...discarding anything else that appears (i.e. the commas)">.
2097
2098	If you wanted the commas preserved as separate fields (i.e. like split
2099	does if your split pattern has capturing parentheses), you would
2100	just make the last parameter undefined (or remove it).
2101
2102
2103	=head2 C<gen_delimited_pat>
2104
2105	The C<gen_delimited_pat> subroutine takes a single (string) argument and
2106	> builds a Friedl-style optimized regex that matches a string delimited
2107	by any one of the characters in the single argument. For example:
2108
2109	gen_delimited_pat(q{'"})
2110
2111	returns the regex:
2112
2113	(?:\"(?:\\\"\|(?!\").)\"\|\'(?:\\\'\|(?!\').)\')
2114
2115	Note that the specified delimiters are automatically quotemeta'd.
2116
2117	A typical use of C<gen_delimited_pat> would be to build special purpose tags
2118	for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2119	(which might contain quoted strings):
2120
2121	my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '\|.)+/>';
2122
2123	extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2124
2125
2126	C<gen_delimited_pat> may also be called with an optional second argument,
2127	which specifies the "escape" character(s) to be used for each delimiter.
2128	For example to match a Pascal-style string (where ' is the delimiter
2129	and '' is a literal ' within the string):
2130
2131	gen_delimited_pat(q{'},q{'});
2132
2133	Different escape characters can be specified for different delimiters.
2134	For example, to specify that '/' is the escape for single quotes
2135	and '%' is the escape for double quotes:
2136
2137	gen_delimited_pat(q{'"},q{/%});
2138
2139	If more delimiters than escape chars are specified, the last escape char
2140	is used for the remaining delimiters.
2141	If no escape char is specified for a given specified delimiter, '\' is used.
2142
2143	Note that
2144	C<gen_delimited_pat> was previously called
2145	C<delimited_pat>. That name may still be used, but is now deprecated.
2146
2147
2148	=head1 DIAGNOSTICS
2149
2150	In a list context, all the functions return C<(undef,$original_text)>
2151	on failure. In a scalar context, failure is indicated by returning C<undef>
2152	(in this case the input text is not modified in any way).
2153
2154	In addition, on failure in I<any> context, the C<$@> variable is set.
2155	Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2156	below.
2157	Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2158	which the error was detected (although not necessarily where it occurred!)
2159	Printing C<$@> directly produces the error message, with the offset appended.
2160	On success, the C<$@> variable is guaranteed to be C<undef>.
2161
2162	The available diagnostics are:
2163
2164	=over 4
2165
2166	=item C<Did not find a suitable bracket: "%s">
2167
2168	The delimiter provided to C<extract_bracketed> was not one of
2169	C<'()[]E<lt>E<gt>{}'>.
2170
2171	=item C<Did not find prefix: /%s/>
2172
2173	A non-optional prefix was specified but wasn't found at the start of the text.
2174
2175	=item C<Did not find opening bracket after prefix: "%s">
2176
2177	C<extract_bracketed> or C<extract_codeblock> was expecting a
2178	particular kind of bracket at the start of the text, and didn't find it.
2179
2180	=item C<No quotelike operator found after prefix: "%s">
2181
2182	C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2183	C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2184	it was extracting.
2185
2186	=item C<Unmatched closing bracket: "%c">
2187
2188	C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2189	a closing bracket where none was expected.
2190
2191	=item C<Unmatched opening bracket(s): "%s">
2192
2193	C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2194	out of characters in the text before closing one or more levels of nested
2195	brackets.
2196
2197	=item C<Unmatched embedded quote (%s)>
2198
2199	C<extract_bracketed> attempted to match an embedded quoted substring, but
2200	failed to find a closing quote to match it.
2201
2202	=item C<Did not find closing delimiter to match '%s'>
2203
2204	C<extract_quotelike> was unable to find a closing delimiter to match the
2205	one that opened the quote-like operation.
2206
2207	=item C<Mismatched closing bracket: expected "%c" but found "%s">
2208
2209	C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2210	a valid bracket delimiter, but it was the wrong species. This usually
2211	indicates a nesting error, but may indicate incorrect quoting or escaping.
2212
2213	=item C<No block delimiter found after quotelike "%s">
2214
2215	C<extract_quotelike> or C<extract_codeblock> found one of the
2216	quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2217	without a suitable block after it.
2218
2219	=item C<Did not find leading dereferencer>
2220
2221	C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2222	a variable, but didn't find any of them.
2223
2224	=item C<Bad identifier after dereferencer>
2225
2226	C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2227	character was not followed by a legal Perl identifier.
2228
2229	=item C<Did not find expected opening bracket at %s>
2230
2231	C<extract_codeblock> failed to find any of the outermost opening brackets
2232	that were specified.
2233
2234	=item C<Improperly nested codeblock at %s>
2235
2236	A nested code block was found that started with a delimiter that was specified
2237	as being only to be used as an outermost bracket.
2238
2239	=item C<Missing second block for quotelike "%s">
2240
2241	C<extract_codeblock> or C<extract_quotelike> found one of the
2242	quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2243
2244	=item C<No match found for opening bracket>
2245
2246	C<extract_codeblock> failed to find a closing bracket to match the outermost
2247	opening bracket.
2248
2249	=item C<Did not find opening tag: /%s/>
2250
2251	C<extract_tagged> did not find a suitable opening tag (after any specified
2252	prefix was removed).
2253
2254	=item C<Unable to construct closing tag to match: /%s/>
2255
2256	C<extract_tagged> matched the specified opening tag and tried to
2257	modify the matched text to produce a matching closing tag (because
2258	none was specified). It failed to generate the closing tag, almost
2259	certainly because the opening tag did not start with a
2260	bracket of some kind.
2261
2262	=item C<Found invalid nested tag: %s>
2263
2264	C<extract_tagged> found a nested tag that appeared in the "reject" list
2265	(and the failure mode was not "MAX" or "PARA").
2266
2267	=item C<Found unbalanced nested tag: %s>
2268
2269	C<extract_tagged> found a nested opening tag that was not matched by a
2270	corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2271
2272	=item C<Did not find closing tag>
2273
2274	C<extract_tagged> reached the end of the text without finding a closing tag
2275	to match the original opening tag (and the failure mode was not
2276	"MAX" or "PARA").
2277
2278
2279
2280
2281	=back
2282
2283
2284	=head1 AUTHOR
2285
2286	Damian Conway ([email protected])
2287
2288
2289	=head1 BUGS AND IRRITATIONS
2290
2291	There are undoubtedly serious bugs lurking somewhere in this code, if
2292	only because parts of it give the impression of understanding a great deal
2293	more about Perl than they really do.
2294
2295	Bug reports and other feedback are most welcome.
2296
2297
2298	=head1 COPYRIGHT
2299
2300	Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2301	This module is free software. It may be used, redistributed
2302	and/or modified under the same terms as Perl itself.

Note: See TracBrowser for help on using the repository browser.

Download in other formats: