source: for-distributions/trunk/bin/windows/perl/lib/Text/ParseWords.pm@ 14489

Last change on this file since 14489 was 14489, checked in by oranfry, 17 years ago

upgrading to perl 5.8

File size: 6.5 KB
Line 
package Text::ParseWords;

# Every variable in this module is either lexical or declared below, so
# strict mode can be enforced.
use strict;

# Package globals: the module version, the Exporter configuration, and the
# $PERL_SINGLE_QUOTE switch that parse_line() consults when unescaping
# single-quoted text.
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $PERL_SINGLE_QUOTE);
$VERSION = "3.24";

# The subs below rely on the lexical "no warnings" pragma, which is not
# available before perl 5.6.
require 5.006;

use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
@EXPORT_OK = qw(old_shellwords);
12
13
# shellwords(@lines)
#
# Split each line into words using whitespace as the delimiter, removing
# quotes and backslash escapes from the result (parse_line() is called with
# $keep false).
#
# Returns the words of all lines as one flat list. Returns an empty list
# when called without arguments, or when parse_line() yields nothing for a
# line that still has text after its leading whitespace is removed.
sub shellwords {
    my (@lines) = @_;
    my @allwords;

    foreach my $line (@lines) {
        # Leading whitespace would otherwise be matched as a delimiter that
        # follows an empty first word.
        $line =~ s/^\s+//;

        my @words = parse_line('\s+', 0, $line);

        # When the text ends with a delimiter, parse_line() closes the list
        # with an undef placeholder; trailing whitespace is not a word.
        pop @words if (@words and !defined $words[-1]);

        return() unless (@words || !length($line));
        push(@allwords, @words);
    }
    return(@allwords);
}
19
20
21
# quotewords($delim, $keep, @lines)
#
# Tokenize every line with parse_line() and return all tokens as a single
# flat list. Stops and returns an empty list as soon as a non-empty line
# produces no tokens.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @collected;

    for my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens from a line that has content means the parse failed.
        if (!@tokens && length($text)) {
            return();
        }
        push @collected, @tokens;
    }

    return @collected;
}
33
34
35
# nested_quotewords($delim, $keep, @lines)
#
# Tokenize every line with parse_line() and return one array reference per
# input line, in input order. Stops and returns an empty list as soon as a
# non-empty line produces no tokens.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @token_lists;

    foreach my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens from a line that has content means the parse failed.
        return() if (!@tokens && length($text));

        push @token_lists, [@tokens];
    }

    return @token_lists;
}
46
47
48
# parse_line($delimiter, $keep, $line)
#
# Tokenize a single string. $delimiter is interpolated into the matching
# pattern as a regular expression. Each pass of the loop removes either one
# quoted run or one unquoted run (plus whatever ended it) from the front of
# $line and appends it to the token being assembled.
#
# $keep false:        backslash escapes are removed from unquoted and
#                     double-quoted text, and quote characters are dropped;
#                     single-quoted text is unescaped only when
#                     $PERL_SINGLE_QUOTE is true.
# $keep true:         quotes and backslashes are left in the tokens.
# $keep 'delimiters': as above, and each matched delimiter is also returned
#                     as a token of its own.
#
# Returns the list of tokens, or nothing when the remaining text cannot be
# matched. Text that ends with a delimiter yields a trailing undef token.
sub parse_line {
    my ($delimiter, $keep, $line) = @_;
    my @tokens;
    my $current;    # token being assembled; undef between tokens

    no warnings 'uninitialized';    # undef strings are tested on purpose

    while (length($line)) {
        $line =~ s/^(["'])                    # a $quote
                   ((?:\\.|(?!\1)[^\\])*)     # and $quoted text
                   \1                         # followed by the same quote
                  |                           # --OR--
                   ^((?:\\.|[^\\"'])*?)       # an $unquoted text
                   (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                              # plus EOL, delimiter, or quote
                  //xs or return;             # extended layout
        my ($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);

        my $was_quoted = defined($quote);

        # A match that consumed nothing at all would never shorten $line.
        return() unless ($was_quoted || length($unquoted) || length($delim));

        if ($keep) {
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/sg;
            if ($was_quoted) {
                if ($quote eq '"') {
                    $quoted =~ s/\\(.)/$1/sg;
                }
                if ($PERL_SINGLE_QUOTE && $quote eq "'") {
                    $quoted =~ s/\\([\\'])/$1/g;
                }
            }
        }

        $current .= substr($line, 0, 0);    # leave results tainted
        $current .= $was_quoted ? $quoted : $unquoted;

        # A delimiter closes the current token.
        if (length($delim)) {
            push(@tokens, $current);
            push(@tokens, $delim) if ($keep eq 'delimiters');
            undef $current;
        }

        # End of input: emit whatever is pending (undef right after a
        # delimiter).
        push(@tokens, $current) unless length($line);
    }

    return(@tokens);
}
91
92
93
sub old_shellwords {

    # Usage:
    #   use Text::ParseWords qw(old_shellwords);
    #   @words = old_shellwords($line);
    #   or
    #   @words = old_shellwords(@lines);
    #   or
    #   @words = old_shellwords();  # defaults to $_ (and clobbers it)
    #
    # All arguments are joined (with no separator) into one string, which
    # is then split into whitespace-separated words. Double-quoted text,
    # single-quoted text and backslash-escaped characters become part of
    # the current word; backslash escapes are removed inside both kinds of
    # quotes.
    #
    # Returns the list of words. On an unmatched quote or a backslash with
    # nothing after it, warns through Carp::carp and returns an empty list.

    no warnings 'uninitialized';    # we will be testing undef strings
    local *_ = \join('', @_) if @_;
    my (@words, $snippet);

    s/\A\s+//;
    while ($_ ne '') {
        my $field = substr($_, 0, 0);   # leave results tainted
        for (;;) {
            if (s/\A"(([^"\\]|\\.)*)"//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif (/\A"/) {
                require Carp;
                Carp::carp("Unmatched double quote: $_");
                return();
            }
            elsif (s/\A'(([^'\\]|\\.)*)'//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif (/\A'/) {
                require Carp;
                Carp::carp("Unmatched single quote: $_");
                return();
            }
            elsif (s/\A\\(.)//s) {
                $snippet = $1;
            }
            elsif (/\A\\/) {
                # A backslash with nothing after it is consumed by no other
                # branch, so without this check the outer loop would never
                # reach the empty string and would run forever.
                require Carp;
                Carp::carp("Trailing backslash: $_");
                return();
            }
            elsif (s/\A([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                # Whitespace or end of input: the current word is complete.
                s/\A\s+//;
                last;
            }
            $field .= $snippet;
        }
        push(@words, $field);
    }
    return @words;
}
144
1451;
146
147__END__
148
149=head1 NAME
150
151Text::ParseWords - parse text into an array of tokens or array of arrays
152
153=head1 SYNOPSIS
154
155 use Text::ParseWords;
156 @lists = &nested_quotewords($delim, $keep, @lines);
157 @words = &quotewords($delim, $keep, @lines);
158 @words = &shellwords(@lines);
159 @words = &parse_line($delim, $keep, $line);
160 @words = &old_shellwords(@lines); # DEPRECATED!
161
162=head1 DESCRIPTION
163
The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words, ignoring delimiters that appear inside quotes. &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
170&parse_line() does tokenizing on a single string. The &*quotewords()
171functions simply call &parse_line(), so if you're only splitting
172one line you can call &parse_line() directly and save a function
173call.
174
175The $keep argument is a boolean flag. If true, then the tokens are
176split on the specified delimiter, but all other characters (quotes,
177backslashes, etc.) are kept in the tokens. If $keep is false then the
178&*quotewords() functions remove all quotes and backslashes that are
179not themselves backslash-escaped or inside of single quotes (i.e.,
180&quotewords() tries to interpret these characters just like the Bourne
181shell). NB: these semantics are significantly different from the
182original version of this module shipped with Perl 5.000 through 5.004.
183As an additional feature, $keep may be the keyword "delimiters" which
184causes the functions to preserve the delimiters in each string as
185tokens in the token lists, in addition to preserving quote and
186backslash characters.
187
188&shellwords() is written as a special case of &quotewords(), and it
189does token parsing with whitespace as a delimiter-- similar to most
190Unix shells.
191
192=head1 EXAMPLES
193
194The sample program:
195
196 use Text::ParseWords;
 @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
198 $i = 0;
199 foreach (@words) {
200 print "$i: <$_>\n";
201 $i++;
202 }
203
204produces:
205
206 0: <this>
207 1: <is>
208 2: <a test>
209 3: <of quotewords>
210 4: <"for>
211 5: <you>
212
213demonstrating:
214
215=over 4
216
217=item 0
218
219a simple word
220
221=item 1
222
223multiple spaces are skipped because of our $delim
224
225=item 2
226
227use of quotes to include a space in a word
228
229=item 3
230
231use of a backslash to include a space in a word
232
233=item 4
234
235use of a backslash to remove the special meaning of a double-quote
236
237=item 5
238
239another simple word (note the lack of effect of the
240backslashed double-quote)
241
242=back
243
244Replacing C<&quotewords('\s+', 0, q{this is...})>
245with C<&shellwords(q{this is...})>
246is a simpler way to accomplish the same thing.
247
248=head1 AUTHORS
249
250Maintainer is Hal Pomeranz <[email protected]>, 1994-1997 (Original
251author unknown). Much of the code for &parse_line() (including the
252primary regexp) from Joerk Behrends <[email protected]>.
253
Examples section and other documentation provided by John Heidemann
255<[email protected]>
256
257Bug reports, patches, and nagging provided by lots of folks-- thanks
258everybody! Special thanks to Michael Schwern <[email protected]>
259for assuring me that a &nested_quotewords() would be useful, and to
260Jeff Friedl <[email protected]> for telling me not to worry about
261error-checking (sort of-- you had to be there).
262
263=cut
Note: See TracBrowser for help on using the repository browser.