1 | #if !defined(RXH) || defined(RX_WANT_SE_DEFS)
|
---|
2 | #define RXH
|
---|
3 |
|
---|
4 | /* Copyright (C) 1992, 1993 Free Software Foundation, Inc.
|
---|
5 |
|
---|
6 | This file is part of the librx library.
|
---|
7 |
|
---|
8 | Librx is free software; you can redistribute it and/or modify it under
|
---|
9 | the terms of the GNU Library General Public License as published by
|
---|
10 | the Free Software Foundation; either version 2, or (at your option)
|
---|
11 | any later version.
|
---|
12 |
|
---|
13 | Librx is distributed in the hope that it will be useful, but WITHOUT
|
---|
14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
---|
15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
---|
16 | for more details.
|
---|
17 |
|
---|
18 | You should have received a copy of the GNU Library General Public
|
---|
19 | License along with this software; see the file COPYING.LIB. If not,
|
---|
20 | write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
|
---|
21 | 02139, USA. */
|
---|
22 | /* t. lord Wed Sep 23 18:20:57 1992 */
|
---|
23 |
|
---|
24 |
|
---|
25 | |
---|
26 |
|
---|
27 |
|
---|
28 |
|
---|
29 |
|
---|
30 |
|
---|
31 |
|
---|
32 | #ifndef RX_WANT_SE_DEFS
|
---|
33 |
|
---|
34 | /* This page: Bitsets */
|
---|
35 |
|
---|
36 | #ifndef RX_subset
|
---|
37 | typedef unsigned int RX_subset;
|
---|
38 | #define RX_subset_bits (32)
|
---|
39 | #define RX_subset_mask (RX_subset_bits - 1)
|
---|
40 | #endif
|
---|
41 |
|
---|
42 | typedef RX_subset * rx_Bitset;
|
---|
43 |
|
---|
44 | #ifdef __STDC__
|
---|
45 | typedef void (*rx_bitset_iterator) (void *, int member_index);
|
---|
46 | #else
|
---|
47 | typedef void (*rx_bitset_iterator) ();
|
---|
48 | #endif
|
---|
49 |
|
---|
50 | #define rx_bitset_subset(N) ((N) / RX_subset_bits)
|
---|
51 | #define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)])
|
---|
52 | #define RX_bitset_access(B,N,OP) \
|
---|
53 | ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
|
---|
54 | #define RX_bitset_member(B,N) RX_bitset_access(B, N, &)
|
---|
55 | #define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=)
|
---|
56 | #define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~)
|
---|
57 | #define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= )
|
---|
58 | #define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
|
---|
59 | #define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset))
|
---|
60 |
|
---|
61 | |
---|
62 |
|
---|
63 |
|
---|
64 | /* This page: Splay trees. */
|
---|
65 |
|
---|
66 | #ifdef __STDC__
|
---|
67 | typedef int (*rx_sp_comparer) (void * a, void * b);
|
---|
68 | #else
|
---|
69 | typedef int (*rx_sp_comparer) ();
|
---|
70 | #endif
|
---|
71 |
|
---|
72 | struct rx_sp_node
|
---|
73 | {
|
---|
74 | void * key;
|
---|
75 | void * data;
|
---|
76 | struct rx_sp_node * kids[2];
|
---|
77 | };
|
---|
78 |
|
---|
79 | #ifdef __STDC__
|
---|
80 | typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
|
---|
81 | #else
|
---|
82 | typedef void (*rx_sp_key_data_freer) ();
|
---|
83 | #endif
|
---|
84 |
|
---|
85 | |
---|
86 |
|
---|
87 | /* giant inflatable hash trees */
|
---|
88 |
|
---|
89 | struct rx_hash_item
|
---|
90 | {
|
---|
91 | struct rx_hash_item * next_same_hash;
|
---|
92 | struct rx_hash * table;
|
---|
93 | unsigned long hash;
|
---|
94 | void * data;
|
---|
95 | void * binding;
|
---|
96 | };
|
---|
97 |
|
---|
98 | struct rx_hash
|
---|
99 | {
|
---|
100 | struct rx_hash * parent;
|
---|
101 | int refs;
|
---|
102 | struct rx_hash * children[13];
|
---|
103 | struct rx_hash_item * buckets [13];
|
---|
104 | int bucket_size [13];
|
---|
105 | };
|
---|
106 |
|
---|
107 | struct rx_hash_rules;
|
---|
108 |
|
---|
109 | #ifdef __STDC__
|
---|
110 | /* should return like == */
|
---|
111 | typedef int (*rx_hash_eq)(void *, void *);
|
---|
112 | typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
|
---|
113 | typedef void (*rx_free_hash)(struct rx_hash *,
|
---|
114 | struct rx_hash_rules *);
|
---|
115 | typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
|
---|
116 | void *);
|
---|
117 | typedef void (*rx_free_hash_item)(struct rx_hash_item *,
|
---|
118 | struct rx_hash_rules *);
|
---|
119 | #else
|
---|
120 | typedef int (*rx_hash_eq)();
|
---|
121 | typedef struct rx_hash * (*rx_alloc_hash)();
|
---|
122 | typedef void (*rx_free_hash)();
|
---|
123 | typedef struct rx_hash_item * (*rx_alloc_hash_item)();
|
---|
124 | typedef void (*rx_free_hash_item)();
|
---|
125 | #endif
|
---|
126 |
|
---|
127 | struct rx_hash_rules
|
---|
128 | {
|
---|
129 | rx_hash_eq eq;
|
---|
130 | rx_alloc_hash hash_alloc;
|
---|
131 | rx_free_hash free_hash;
|
---|
132 | rx_alloc_hash_item hash_item_alloc;
|
---|
133 | rx_free_hash_item free_hash_item;
|
---|
134 | };
|
---|
135 |
|
---|
136 | |
---|
137 |
|
---|
138 | /* Forward declarations */
|
---|
139 |
|
---|
140 | struct rx_cache;
|
---|
141 | struct rx_superset;
|
---|
142 | struct rx;
|
---|
143 | struct rx_se_list;
|
---|
144 |
|
---|
145 | |
---|
146 |
|
---|
147 |
|
---|
148 | /*
|
---|
149 | * GLOSSARY
|
---|
150 | *
|
---|
151 | * regexp
|
---|
152 | * regular expression
|
---|
153 | * expression
|
---|
154 | * pattern - a `regular' expression. The expression
|
---|
155 | * need not be formally regular -- it can contain
|
---|
156 | * constructs that don't correspond to purely regular
|
---|
157 | * expressions.
|
---|
158 | *
|
---|
159 | * buffer
|
---|
160 | * string - the string (or strings) being searched or matched.
|
---|
161 | *
|
---|
162 | * pattern buffer - a structure of type `struct re_pattern_buffer'
|
---|
163 | * This in turn contains a `struct rx', which holds the
|
---|
164 | * NFA compiled from a pattern, as well as some of the state
|
---|
165 | * of a matcher using the pattern.
|
---|
166 | *
|
---|
167 | * NFA - nondeterministic finite automata. Some people
|
---|
168 | * use this term to a member of the class of
|
---|
169 | * regular automata (those corresponding to a regular
|
---|
170 | * language). However, in this code, the meaning is
|
---|
171 | * more general. The automata used by Rx are comperable
|
---|
172 | * in power to what are usually called `push down automata'.
|
---|
173 | *
|
---|
174 | * Two NFA are built by rx for every pattern. One is built
|
---|
175 | * by the compiler. The other is built from the first, on
|
---|
176 | * the fly, by the matcher. The latter is called the `superstate
|
---|
177 | * NFA' because its states correspond to sets of states from
|
---|
178 | * the first NFA. (Joe Keane gets credit for the name
|
---|
179 | * `superstate NFA').
|
---|
180 | *
|
---|
181 | * NFA edges
|
---|
182 | * epsilon edges
|
---|
183 | * side-effect edges - The NFA compiled from a pattern can have three
|
---|
184 | * kinds of edges. Epsilon edges can be taken freely anytime
|
---|
185 | * their source state is reached. Character set edges can be
|
---|
186 | * taken when their source state is reached and when the next
|
---|
187 | * character in the buffer is a member of the set. Side effect
|
---|
188 | * edges imply a transition that can only be taken after the
|
---|
189 | * indicated side effect has been successfully accomplished.
|
---|
190 | * Some examples of side effects are:
|
---|
191 | *
|
---|
192 | * Storing the current match position to record the
|
---|
193 | * location of a parentesized subexpression.
|
---|
194 | *
|
---|
195 | * Advancing the matcher over N characters if they
|
---|
196 | * match the N characters previously matched by a
|
---|
197 | * parentesized subexpression.
|
---|
198 | *
|
---|
199 | * Both of those kinds of edges occur in the NFA generated
|
---|
200 | * by the pattern: \(.\)\1
|
---|
201 | *
|
---|
202 | * Epsilon and side effect edges are similar. Unfortunately,
|
---|
203 | * some of the code uses the name `epsilon edge' to mean
|
---|
204 | * both epsilon and side effect edges. For example, the
|
---|
205 | * function has_non_idempotent_epsilon_path computes the existance
|
---|
206 | * of a non-trivial path containing only a mix of epsilon and
|
---|
207 | * side effect edges. In that case `nonidempotent epsilon' is being
|
---|
208 | * used to mean `side effect'.
|
---|
209 | */
|
---|
210 |
|
---|
211 |
|
---|
212 |
|
---|
213 | |
---|
214 |
|
---|
215 |
|
---|
216 | /* LOW LEVEL PATTERN BUFFERS */
|
---|
217 |
|
---|
218 | /* Suppose that from some NFA state, more than one path through
|
---|
219 | * side-effect edges is possible. In what order should the paths
|
---|
220 | * be tried? A function of type rx_se_list_order answers that
|
---|
221 | * question. It compares two lists of side effects, and says
|
---|
222 | * which list comes first.
|
---|
223 | */
|
---|
224 |
|
---|
225 | #ifdef __STDC__
|
---|
226 | typedef int (*rx_se_list_order) (struct rx *,
|
---|
227 | struct rx_se_list *,
|
---|
228 | struct rx_se_list *);
|
---|
229 | #else
|
---|
230 | typedef int (*rx_se_list_order) ();
|
---|
231 | #endif
|
---|
232 |
|
---|
233 |
|
---|
234 |
|
---|
235 | /* Struct RX holds a compiled regular expression - that is, an nfa
|
---|
236 | * ready to be converted on demand to a more efficient superstate nfa.
|
---|
237 | * This is for the low level interface. The high-level interfaces enclose
|
---|
238 | * this in a `struct re_pattern_buffer'.
|
---|
239 | */
|
---|
240 | struct rx
|
---|
241 | {
|
---|
242 | /* The compiler assigns a unique id to every pattern.
|
---|
243 | * Like sequence numbers in X, there is a subtle bug here
|
---|
244 | * if you use Rx in a system that runs for a long time.
|
---|
245 | * But, because of the way the caches work out, it is almost
|
---|
246 | * impossible to trigger the Rx version of this bug.
|
---|
247 | *
|
---|
248 | * The id is used to validate superstates found in a cache
|
---|
249 | * of superstates. It isn't sufficient to let a superstate
|
---|
250 | * point back to the rx for which it was compiled -- the caller
|
---|
251 | * may be re-using a `struct rx' in which case the superstate
|
---|
252 | * is not really valid. So instead, superstates are validated
|
---|
253 | * by checking the sequence number of the pattern for which
|
---|
254 | * they were built.
|
---|
255 | */
|
---|
256 | int rx_id;
|
---|
257 |
|
---|
258 | /* This is memory mgt. state for superstates. This may be
|
---|
259 | * shared by more than one struct rx.
|
---|
260 | */
|
---|
261 | struct rx_cache * cache;
|
---|
262 |
|
---|
263 | /* Every regex defines the size of its own character set.
|
---|
264 | * A superstate has an array of this size, with each element
|
---|
265 | * a `struct rx_inx'. So, don't make this number too large.
|
---|
266 | * In particular, don't make it 2^16.
|
---|
267 | */
|
---|
268 | int local_cset_size;
|
---|
269 |
|
---|
270 | /* After the NFA is built, it is copied into a contiguous region
|
---|
271 | * of memory (mostly for compatability with GNU regex).
|
---|
272 | * Here is that region, and it's size:
|
---|
273 | */
|
---|
274 | void * buffer;
|
---|
275 | unsigned long allocated;
|
---|
276 |
|
---|
277 | /* Clients of RX can ask for some extra storage in the space pointed
|
---|
278 | * to by BUFFER. The field RESERVED is an input parameter to the
|
---|
279 | * compiler. After compilation, this much space will be available
|
---|
280 | * at (buffer + allocated - reserved)
|
---|
281 | */
|
---|
282 | unsigned long reserved;
|
---|
283 |
|
---|
284 | /* --------- The remaining fields are for internal use only. --------- */
|
---|
285 | /* --------- But! they must be initialized to 0. --------- */
|
---|
286 |
|
---|
287 | /* NODEC is the number of nodes in the NFA with non-epsilon
|
---|
288 | * transitions.
|
---|
289 | */
|
---|
290 | int nodec;
|
---|
291 |
|
---|
292 | /* EPSNODEC is the number of nodes with only epsilon transitions. */
|
---|
293 | int epsnodec;
|
---|
294 |
|
---|
295 | /* The sum (NODEC + EPSNODEC) is the total number of states in the
|
---|
296 | * compiled NFA.
|
---|
297 | */
|
---|
298 |
|
---|
299 | /* Lists of side effects as stored in the NFA are `hash consed'..meaning
|
---|
300 | * that lists with the same elements are ==. During compilation,
|
---|
301 | * this table facilitates hash-consing.
|
---|
302 | */
|
---|
303 | struct rx_hash se_list_memo;
|
---|
304 |
|
---|
305 | /* Lists of NFA states are also hashed.
|
---|
306 | */
|
---|
307 | struct rx_hash set_list_memo;
|
---|
308 |
|
---|
309 |
|
---|
310 |
|
---|
311 |
|
---|
312 | /* The compiler and matcher must build a number of instruction frames.
|
---|
313 | * The format of these frames is fixed (c.f. struct rx_inx). The values
|
---|
314 | * of the instructions is not fixed.
|
---|
315 | *
|
---|
316 | * An enumerated type (enum rx_opcode) defines the set of instructions
|
---|
317 | * that the compiler or matcher might generate. When filling an instruction
|
---|
318 | * frame, the INX field is found by indexing this instruction table
|
---|
319 | * with an opcode:
|
---|
320 | */
|
---|
321 | void ** instruction_table;
|
---|
322 |
|
---|
323 | /* The list of all states in an NFA.
|
---|
324 | * During compilation, the NEXT field of NFA states links this list.
|
---|
325 | * After compilation, all the states are compacted into an array,
|
---|
326 | * ordered by state id numbers. At that time, this points to the base
|
---|
327 | * of that array.
|
---|
328 | */
|
---|
329 | struct rx_nfa_state *nfa_states;
|
---|
330 |
|
---|
331 | /* Every nfa begins with one distinguished starting state:
|
---|
332 | */
|
---|
333 | struct rx_nfa_state *start;
|
---|
334 |
|
---|
335 | /* This orders the search through super-nfa paths.
|
---|
336 | * See the comment near the typedef of rx_se_list_order.
|
---|
337 | */
|
---|
338 | rx_se_list_order se_list_cmp;
|
---|
339 |
|
---|
340 | struct rx_superset * start_set;
|
---|
341 | };
|
---|
342 |
|
---|
343 |
|
---|
344 |
|
---|
345 | |
---|
346 |
|
---|
347 | /* SYNTAX TREES */
|
---|
348 |
|
---|
349 | /* Compilation is in stages.
|
---|
350 | *
|
---|
351 | * In the first stage, a pattern specified by a string is
|
---|
352 | * translated into a syntax tree. Later stages will convert
|
---|
353 | * the syntax tree into an NFA optimized for conversion to a
|
---|
354 | * superstate-NFA.
|
---|
355 | *
|
---|
356 | * This page is about syntax trees.
|
---|
357 | */
|
---|
358 |
|
---|
359 | enum rexp_node_type
|
---|
360 | {
|
---|
361 | r_cset, /* Match from a character set. `a' or `[a-z]'*/
|
---|
362 | r_concat, /* Concat two subexpressions. `ab' */
|
---|
363 | r_alternate, /* Choose one of two subexpressions. `a\|b' */
|
---|
364 | r_opt, /* Optional subexpression. `a?' */
|
---|
365 | r_star, /* Repeated subexpression. `a*' */
|
---|
366 |
|
---|
367 |
|
---|
368 | /* A 2phase-star is a variation on a repeated subexpression.
|
---|
369 | * In this case, there are two subexpressions. The first, if matched,
|
---|
370 | * begins a repitition (otherwise, the whole expression is matches the
|
---|
371 | * empth string).
|
---|
372 | *
|
---|
373 | * After matching the first subexpression, a 2phase star either finishes,
|
---|
374 | * or matches the second subexpression. If the second subexpression is
|
---|
375 | * matched, then the whole construct repeats.
|
---|
376 | *
|
---|
377 | * 2phase stars are used in two circumstances. First, they
|
---|
378 | * are used as part of the implementation of POSIX intervals (counted
|
---|
379 | * repititions). Second, they are used to implement proper star
|
---|
380 | * semantics when the repeated subexpression contains paths of
|
---|
381 | * only side effects. See rx_compile for more information.
|
---|
382 | */
|
---|
383 | r_2phase_star,
|
---|
384 |
|
---|
385 |
|
---|
386 | /* c.f. "typedef void * rx_side_effect" */
|
---|
387 | r_side_effect,
|
---|
388 |
|
---|
389 | /* This is an extension type: It is for transient use in source->source
|
---|
390 | * transformations (implemented over syntax trees).
|
---|
391 | */
|
---|
392 | r_data
|
---|
393 | };
|
---|
394 |
|
---|
395 | /* A side effect is a matcher-specific action associated with
|
---|
396 | * transitions in the NFA. The details of side effects are up
|
---|
397 | * to the matcher. To the compiler and superstate constructors
|
---|
398 | * side effects are opaque:
|
---|
399 | */
|
---|
400 |
|
---|
401 | typedef void * rx_side_effect;
|
---|
402 |
|
---|
403 | /* Nodes in a syntax tree are of this type:
|
---|
404 | */
|
---|
405 | struct rexp_node
|
---|
406 | {
|
---|
407 | enum rexp_node_type type;
|
---|
408 | union
|
---|
409 | {
|
---|
410 | rx_Bitset cset;
|
---|
411 | rx_side_effect side_effect;
|
---|
412 | struct
|
---|
413 | {
|
---|
414 | struct rexp_node *left;
|
---|
415 | struct rexp_node *right;
|
---|
416 | } pair;
|
---|
417 | void * data;
|
---|
418 | } params;
|
---|
419 | };
|
---|
420 |
|
---|
421 |
|
---|
422 | |
---|
423 |
|
---|
424 | /* NFA
|
---|
425 | *
|
---|
426 | * A syntax tree is compiled into an NFA. This page defines the structure
|
---|
427 | * of that NFA.
|
---|
428 | */
|
---|
429 |
|
---|
430 | struct rx_nfa_state
|
---|
431 | {
|
---|
432 | /* These are kept in a list as the NFA is being built. */
|
---|
433 | struct rx_nfa_state *next;
|
---|
434 |
|
---|
435 | /* After the NFA is built, states are given integer id's.
|
---|
436 | * States whose outgoing transitions are all either epsilon or
|
---|
437 | * side effect edges are given ids less than 0. Other states
|
---|
438 | * are given successive non-negative ids starting from 0.
|
---|
439 | */
|
---|
440 | int id;
|
---|
441 |
|
---|
442 | /* The list of NFA edges that go from this state to some other. */
|
---|
443 | struct rx_nfa_edge *edges;
|
---|
444 |
|
---|
445 | /* If you land in this state, then you implicitly land
|
---|
446 | * in all other states reachable by only epsilon translations.
|
---|
447 | * Call the set of maximal paths to such states the epsilon closure
|
---|
448 | * of this state.
|
---|
449 | *
|
---|
450 | * There may be other states that are reachable by a mixture of
|
---|
451 | * epsilon and side effect edges. Consider the set of maximal paths
|
---|
452 | * of that sort from this state. Call it the epsilon-side-effect
|
---|
453 | * closure of the state.
|
---|
454 | *
|
---|
455 | * The epsilon closure of the state is a subset of the epsilon-side-
|
---|
456 | * effect closure. It consists of all the paths that contain
|
---|
457 | * no side effects -- only epsilon edges.
|
---|
458 | *
|
---|
459 | * The paths in the epsilon-side-effect closure can be partitioned
|
---|
460 | * into equivalance sets. Two paths are equivalant if they have the
|
---|
461 | * same set of side effects, in the same order. The epsilon-closure
|
---|
462 | * is one of these equivalance sets. Let's call these equivalance
|
---|
463 | * sets: observably equivalant path sets. That name is chosen
|
---|
464 | * because equivalance of two paths means they cause the same side
|
---|
465 | * effects -- so they lead to the same subsequent observations other
|
---|
466 | * than that they may wind up in different target states.
|
---|
467 | *
|
---|
468 | * The superstate nfa, which is derived from this nfa, is based on
|
---|
469 | * the observation that all of the paths in an observably equivalant
|
---|
470 | * path set can be explored at the same time, provided that the
|
---|
471 | * matcher keeps track not of a single nfa state, but of a set of
|
---|
472 | * states. In particular, after following all the paths in an
|
---|
473 | * observably equivalant set, you wind up at a set of target states.
|
---|
474 | * That set of target states corresponds to one state in the
|
---|
475 | * superstate NFA.
|
---|
476 | *
|
---|
477 | * Staticly, before matching begins, it is convenient to analyze the
|
---|
478 | * nfa. Each state is labeled with a list of the observably
|
---|
479 | * equivalant path sets who's union covers all the
|
---|
480 | * epsilon-side-effect paths beginning in this state. This list is
|
---|
481 | * called the possible futures of the state.
|
---|
482 | *
|
---|
483 | * A trivial example is this NFA:
|
---|
484 | * s1
|
---|
485 | * A ---> B
|
---|
486 | *
|
---|
487 | * s2
|
---|
488 | * ---> C
|
---|
489 | *
|
---|
490 | * epsilon s1
|
---|
491 | * ---------> D ------> E
|
---|
492 | *
|
---|
493 | *
|
---|
494 | * In this example, A has two possible futures.
|
---|
495 | * One invokes the side effect `s1' and contains two paths,
|
---|
496 | * one ending in state B, the other in state E.
|
---|
497 | * The other invokes the side effect `s2' and contains only
|
---|
498 | * one path, landing in state C.
|
---|
499 | */
|
---|
500 | struct rx_possible_future *futures;
|
---|
501 |
|
---|
502 |
|
---|
503 | /* There are exactly two distinguished states in every NFA: */
|
---|
504 | unsigned int is_final:1;
|
---|
505 | unsigned int is_start:1;
|
---|
506 |
|
---|
507 | /* These are used during NFA construction... */
|
---|
508 | unsigned int eclosure_needed:1;
|
---|
509 | unsigned int mark:1;
|
---|
510 | };
|
---|
511 |
|
---|
512 |
|
---|
513 | /* An edge in an NFA is typed: */
|
---|
514 | enum rx_nfa_etype
|
---|
515 | {
|
---|
516 | /* A cset edge is labled with a set of characters one of which
|
---|
517 | * must be matched for the edge to be taken.
|
---|
518 | */
|
---|
519 | ne_cset,
|
---|
520 |
|
---|
521 | /* An epsilon edge is taken whenever its starting state is
|
---|
522 | * reached.
|
---|
523 | */
|
---|
524 | ne_epsilon,
|
---|
525 |
|
---|
526 | /* A side effect edge is taken whenever its starting state is
|
---|
527 | * reached. Side effects may cause the match to fail or the
|
---|
528 | * position of the matcher to advance.
|
---|
529 | */
|
---|
530 | ne_side_effect /* A special kind of epsilon. */
|
---|
531 | };
|
---|
532 |
|
---|
533 | struct rx_nfa_edge
|
---|
534 | {
|
---|
535 | struct rx_nfa_edge *next;
|
---|
536 | enum rx_nfa_etype type;
|
---|
537 | struct rx_nfa_state *dest;
|
---|
538 | union
|
---|
539 | {
|
---|
540 | rx_Bitset cset;
|
---|
541 | rx_side_effect side_effect;
|
---|
542 | } params;
|
---|
543 | };
|
---|
544 |
|
---|
545 |
|
---|
546 |
|
---|
547 | /* A possible future consists of a list of side effects
|
---|
548 | * and a set of destination states. Below are their
|
---|
549 | * representations. These structures are hash-consed which
|
---|
550 | * means that lists with the same elements share a representation
|
---|
551 | * (their addresses are ==).
|
---|
552 | */
|
---|
553 |
|
---|
554 | struct rx_nfa_state_set
|
---|
555 | {
|
---|
556 | struct rx_nfa_state * car;
|
---|
557 | struct rx_nfa_state_set * cdr;
|
---|
558 | };
|
---|
559 |
|
---|
560 | struct rx_se_list
|
---|
561 | {
|
---|
562 | rx_side_effect car;
|
---|
563 | struct rx_se_list * cdr;
|
---|
564 | };
|
---|
565 |
|
---|
566 | struct rx_possible_future
|
---|
567 | {
|
---|
568 | struct rx_possible_future *next;
|
---|
569 | struct rx_se_list * effects;
|
---|
570 | struct rx_nfa_state_set * destset;
|
---|
571 | };
|
---|
572 |
|
---|
573 | |
---|
574 |
|
---|
575 |
|
---|
576 | /* This begins the description of the superstate NFA.
|
---|
577 | *
|
---|
578 | * The superstate NFA corresponds to the NFA in these ways:
|
---|
579 | *
|
---|
580 | * Every superstate NFA states SUPER correspond to sets of NFA states,
|
---|
581 | * nfa_states(SUPER).
|
---|
582 | *
|
---|
583 | * Superstate edges correspond to NFA paths.
|
---|
584 | *
|
---|
585 | * The superstate has no epsilon transitions;
|
---|
586 | * every edge has a character label, and a (possibly empty) side
|
---|
587 | * effect label. The side effect label corresponds to a list of
|
---|
588 | * side effects that occur in the NFA. These parts are referred
|
---|
589 | * to as: superedge_character(EDGE) and superedge_sides(EDGE).
|
---|
590 | *
|
---|
591 | * For a superstate edge EDGE starting in some superstate SUPER,
|
---|
592 | * the following is true (in pseudo-notation :-):
|
---|
593 | *
|
---|
594 | * exists DEST in nfa_states s.t.
|
---|
595 | * exists nfaEDGE in nfa_edges s.t.
|
---|
596 | * origin (nfaEDGE) == DEST
|
---|
597 | * && origin (nfaEDGE) is a member of nfa_states(SUPER)
|
---|
598 | * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
|
---|
599 | * sides_of_possible_future (PF) == superedge_sides (EDGE)
|
---|
600 | *
|
---|
601 | * also:
|
---|
602 | *
|
---|
603 | * let SUPER2 := superedge_destination(EDGE)
|
---|
604 | * nfa_states(SUPER2)
|
---|
605 | * == union of all nfa state sets S s.t.
|
---|
606 | * exists PF in possible_futures(dest(nfaEDGE)) s.t.
|
---|
607 | * sides_of_possible_future (PF) == superedge_sides (EDGE)
|
---|
608 | * && S == dests_of_possible_future (PF) }
|
---|
609 | *
|
---|
610 | * Or in english, every superstate is a set of nfa states. A given
|
---|
611 | * character and a superstate implies many transitions in the NFA --
|
---|
612 | * those that begin with an edge labeled with that character from a
|
---|
613 | * state in the set corresponding to the superstate.
|
---|
614 | *
|
---|
615 | * The destinations of those transitions each have a set of possible
|
---|
616 | * futures. A possible future is a list of side effects and a set of
|
---|
617 | * destination NFA states. Two sets of possible futures can be
|
---|
618 | * `merged' by combining all pairs of possible futures that have the
|
---|
619 | * same side effects. A pair is combined by creating a new future
|
---|
620 | * with the same side effect but the union of the two destination sets.
|
---|
621 | * In this way, all the possible futures suggested by a superstate
|
---|
622 | * and a character can be merged into a set of possible futures where
|
---|
623 | * no two elements of the set have the same set of side effects.
|
---|
624 | *
|
---|
625 | * The destination of a possible future, being a set of NFA states,
|
---|
626 | * corresponds to a supernfa state. So, the merged set of possible
|
---|
627 | * futures we just created can serve as a set of edges in the
|
---|
628 | * supernfa.
|
---|
629 | *
|
---|
630 | * The representation of the superstate nfa and the nfa is critical.
|
---|
631 | * The nfa has to be compact, but has to facilitate the rapid
|
---|
632 | * computation of missing superstates. The superstate nfa has to
|
---|
633 | * be fast to interpret, lazilly constructed, and bounded in space.
|
---|
634 | *
|
---|
635 | * To facilitate interpretation, the superstate data structures are
|
---|
636 | * peppered with `instruction frames'. There is an instruction set
|
---|
637 | * defined below which matchers using the supernfa must be able to
|
---|
638 | * interpret.
|
---|
639 | *
|
---|
640 | * We'd like to make it possible but not mandatory to use code
|
---|
641 | * addresses to represent instructions (c.f. gcc's computed goto).
|
---|
642 | * Therefore, we define an enumerated type of opcodes, and when
|
---|
643 | * writing one of these instructions into a data structure, use
|
---|
644 | * the opcode as an index into a table of instruction values.
|
---|
645 | *
|
---|
646 | * Here are the opcodes that occur in the superstate nfa:
|
---|
647 | */
|
---|
648 |
|
---|
649 |
|
---|
650 | /* Every superstate contains a table of instruction frames indexed
|
---|
651 | * by characters. A normal `move' in a matcher is to fetch the next
|
---|
652 | * character and use it as an index into a superstates transition
|
---|
653 | * table.
|
---|
654 | *
|
---|
655 | * In the fasted case, only one edge follows from that character.
|
---|
656 | * In other cases there is more work to do.
|
---|
657 | *
|
---|
658 | * The descriptions of the opcodes refer to data structures that are
|
---|
659 | * described further below.
|
---|
660 | */
|
---|
661 |
|
---|
662 | enum rx_opcode
|
---|
663 | {
|
---|
664 | /*
|
---|
665 | * BACKTRACK_POINT is invoked when a character transition in
|
---|
666 | * a superstate leads to more than one edge. In that case,
|
---|
667 | * the edges have to be explored independently using a backtracking
|
---|
668 | * strategy.
|
---|
669 | *
|
---|
670 | * A BACKTRACK_POINT instruction is stored in a superstate's
|
---|
671 | * transition table for some character when it is known that that
|
---|
672 | * character crosses more than one edge. On encountering this
|
---|
673 | * instruction, the matcher saves enough state to backtrack to this
|
---|
674 | * point in the match later.
|
---|
675 | */
|
---|
676 | rx_backtrack_point = 0, /* data is (struct transition_class *) */
|
---|
677 |
|
---|
678 | /*
|
---|
679 | * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
|
---|
680 | * There is one occurence of this instruction per rx_distinct_future.
|
---|
681 | * This instruction is skipped if a rx_distinct_future has no side effects.
|
---|
682 | */
|
---|
683 | rx_do_side_effects = rx_backtrack_point + 1,
|
---|
684 |
|
---|
685 | /* data is (struct rx_distinct_future *) */
|
---|
686 |
|
---|
687 | /*
|
---|
688 | * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
|
---|
689 | * destination superstate has been reclaimed (or was never built).
|
---|
690 | * It recomputes the destination superstate.
|
---|
691 | * RX_CACHE_MISS is also stored in a superstate transition table before
|
---|
692 | * any of its edges have been built.
|
---|
693 | */
|
---|
694 | rx_cache_miss = rx_do_side_effects + 1,
|
---|
695 | /* data is (struct rx_distinct_future *) */
|
---|
696 |
|
---|
697 | /*
|
---|
698 | * RX_NEXT_CHAR is called to consume the next character and take the
|
---|
699 | * corresponding transition. This is the only instruction that uses
|
---|
700 | * the DATA field of the instruction frame instead of DATA_2.
|
---|
701 | * (see EXPLORE_FUTURE in regex.c).
|
---|
702 | */
|
---|
703 | rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
|
---|
704 |
|
---|
705 | /* RX_BACKTRACK indicates that a transition fails.
|
---|
706 | */
|
---|
707 | rx_backtrack = rx_next_char + 1, /* no data */
|
---|
708 |
|
---|
709 | /*
|
---|
710 | * RX_ERROR_INX is stored only in places that should never be executed.
|
---|
711 | */
|
---|
712 | rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
|
---|
713 |
|
---|
714 | rx_num_instructions = rx_error_inx + 1
|
---|
715 | };
|
---|
716 |
|
---|
717 | /* An id_instruction_table holds the values stored in instruction
|
---|
718 | * frames. The table is indexed by the enums declared above.
|
---|
719 | */
|
---|
720 | extern void * rx_id_instruction_table[rx_num_instructions];
|
---|
721 |
|
---|
722 | /* The heart of the matcher is a `word-code-interpreter'
|
---|
723 | * (like a byte-code interpreter, except that instructions
|
---|
724 | * are a full word wide).
|
---|
725 | *
|
---|
726 | * Instructions are not stored in a vector of code, instead,
|
---|
727 | * they are scattered throughout the data structures built
|
---|
728 | * by the regexp compiler and the matcher. One word-code instruction,
|
---|
729 | * together with the arguments to that instruction, constitute
|
---|
730 | * an instruction frame (struct rx_inx).
|
---|
731 | *
|
---|
732 | * This structure type is padded by hand to a power of 2 because
|
---|
733 | * in one of the dominant cases, we dispatch by indexing a table
|
---|
734 | * of instruction frames. If that indexing can be accomplished
|
---|
735 | * by just a shift of the index, we're happy.
|
---|
736 | *
|
---|
737 | * Instructions take at most one argument, but there are two
|
---|
738 | * slots in an instruction frame that might hold that argument.
|
---|
739 | * These are called data and data_2. The data slot is only
|
---|
740 | * used for one instruction (RX_NEXT_CHAR). For all other
|
---|
741 | * instructions, data should be set to 0.
|
---|
742 | *
|
---|
743 | * RX_NEXT_CHAR is the most important instruction by far.
|
---|
744 | * By reserving the data field for its exclusive use,
|
---|
745 | * instruction dispatch is sped up in that case. There is
|
---|
746 | * no need to fetch both the instruction and the data,
|
---|
747 | * only the data is needed. In other words, a `cycle' begins
|
---|
748 | * by fetching the field data. If that is non-0, then it must
|
---|
749 | * be the destination state of a next_char transition, so
|
---|
750 | * make that value the current state, advance the match position
|
---|
751 | * by one character, and start a new cycle. On the other hand,
|
---|
752 | * if data is 0, fetch the instruction and do a more complicated
|
---|
753 | * dispatch on that.
|
---|
754 | */
|
---|
755 |
|
---|
756 | struct rx_inx
|
---|
757 | {
|
---|
758 | void * data;
|
---|
759 | void * data_2;
|
---|
760 | void * inx;
|
---|
761 | void * fnord;
|
---|
762 | };
|
---|
763 |
|
---|
764 | #ifndef RX_TAIL_ARRAY
|
---|
765 | #define RX_TAIL_ARRAY 1
|
---|
766 | #endif
|
---|
767 |
|
---|
768 | /* A superstate corresponds to a set of nfa states. Those sets are
|
---|
769 | * represented by STRUCT RX_SUPERSET. The constructors
|
---|
770 | * guarantee that only one (shared) structure is created for a given set.
|
---|
771 | */
|
---|
772 | struct rx_superset
|
---|
773 | {
|
---|
774 | int refs; /* This is a reference counted structure. */
|
---|
775 |
|
---|
776 | /* We keep these sets in a cache because (in an unpredictable way),
|
---|
777 | * the same set is often created again and again. But that is also
|
---|
778 | * problematic -- compatibility with POSIX and GNU regex requires
|
---|
779 | * that we not be able to tell when a program discards a particular
|
---|
780 | * NFA (thus invalidating the supersets created from it).
|
---|
781 | *
|
---|
782 | * But when a cache hit appears to occur, we will have in hand the
|
---|
783 | * nfa for which it may have happened. That is why every nfa is given
|
---|
784 | * its own sequence number. On a cache hit, the cache is validated
|
---|
785 | * by comparing the nfa sequence number to this field:
|
---|
786 | */
|
---|
787 | int id;
|
---|
788 |
|
---|
789 | struct rx_nfa_state * car; /* May or may not be a valid addr. */
|
---|
790 | struct rx_superset * cdr;
|
---|
791 |
|
---|
792 | /* If the corresponding superstate exists: */
|
---|
793 | struct rx_superstate * superstate;
|
---|
794 |
|
---|
795 |
|
---|
796 | /* There is another bookkeeping problem. It is expensive to
|
---|
797 | * compute the starting nfa state set for an nfa. So, once computed,
|
---|
798 | * it is cached in the `struct rx'.
|
---|
799 | *
|
---|
800 | * But, the state set can be flushed from the superstate cache.
|
---|
801 | * When that happens, we can't know if the corresponding `struct rx'
|
---|
802 | * is still alive or if it has been freed or re-used by the program.
|
---|
803 | * So, the cached pointer to this set in a struct rx might be invalid
|
---|
804 | * and we need a way to validate it.
|
---|
805 | *
|
---|
806 | * Fortunately, even if this set is flushed from the cache, it is
|
---|
807 | * not freed. It just goes on the free-list of supersets.
|
---|
808 | * So we can still examine it.
|
---|
809 | *
|
---|
810 | * So to validate a starting set memo, check to see if the
|
---|
811 | * starts_for field still points back to the struct rx in question,
|
---|
812 | * and if the ID matches the rx sequence number.
|
---|
813 | */
|
---|
814 | struct rx * starts_for;
|
---|
815 |
|
---|
816 | /* This is used to link into a hash bucket so these objects can
|
---|
817 | * be `hash-consed'.
|
---|
818 | */
|
---|
819 | struct rx_hash_item hash_item;
|
---|
820 | };
|
---|
821 |
|
---|
822 | #define rx_protect_superset(RX,CON) (++(CON)->refs)
|
---|
823 |
|
---|
824 | /* The terminology may be confusing (rename this structure?).
|
---|
825 | * Every character occurs in at most one rx_super_edge per super-state.
|
---|
826 | * But, that structure might have more than one option, indicating a point
|
---|
827 | * of non-determinism.
|
---|
828 | *
|
---|
829 | * In other words, this structure holds a list of superstate edges
|
---|
830 | * sharing a common starting state and character label. The edges
|
---|
831 | * are in the field OPTIONS. All superstate edges sharing the same
|
---|
832 | * starting state and character are in this list.
|
---|
833 | */
|
---|
834 | struct rx_super_edge
|
---|
835 | {
|
---|
836 | struct rx_super_edge *next;
|
---|
837 | struct rx_inx rx_backtrack_frame;
|
---|
838 | int cset_size;
|
---|
839 | rx_Bitset cset;
|
---|
840 | struct rx_distinct_future *options;
|
---|
841 | };
|
---|
842 |
|
---|
843 | /* A superstate is a set of nfa states (RX_SUPERSET) along
|
---|
844 | * with a transition table. Superstates are built on demand and reclaimed
|
---|
845 | * without warning. To protect a superstate from this ghastly fate,
|
---|
846 | * use LOCK_SUPERSTATE.
|
---|
847 | */
|
---|
848 | struct rx_superstate
|
---|
849 | {
|
---|
850 | int rx_id; /* c.f. the id field of rx_superset */
|
---|
851 | int locks; /* protection from reclamation */
|
---|
852 |
|
---|
853 | /* Within a superstate cache, all the superstates are kept in a big
|
---|
854 | * queue. The tail of the queue is the state most likely to be
|
---|
855 | * reclaimed. The *recyclable fields hold the queue position of
|
---|
856 | * this state.
|
---|
857 | */
|
---|
858 | struct rx_superstate * next_recyclable;
|
---|
859 | struct rx_superstate * prev_recyclable;
|
---|
860 |
|
---|
861 | /* The supernfa edges that exist in the cache and that have
|
---|
862 | * this state as their destination are kept in this list:
|
---|
863 | */
|
---|
864 | struct rx_distinct_future * transition_refs;
|
---|
865 |
|
---|
866 | /* The list of nfa states corresponding to this superstate: */
|
---|
867 | struct rx_superset * contents;
|
---|
868 |
|
---|
869 | /* The list of edges in the cache beginning from this state. */
|
---|
870 | struct rx_super_edge * edges;
|
---|
871 |
|
---|
872 | /* A tail of the recyclable queue is marked as semifree. A semifree
|
---|
873 | * state has no incoming next_char transitions -- any transition
|
---|
874 | * into a semifree state causes a complex dispatch with the side
|
---|
875 | * effect of rescuing the state from its semifree state.
|
---|
876 | *
|
---|
877 | * An alternative to this might be to make next_char more expensive,
|
---|
878 | * and to move a state to the head of the recyclable queue whenever
|
---|
879 | * it is entered. That way, popular states would never be recycled.
|
---|
880 | *
|
---|
881 | * But unilaterally making next_char more expensive actually loses.
|
---|
882 | * So, incoming transitions are only made expensive for states near
|
---|
883 | * the tail of the recyclable queue. The more cache contention
|
---|
884 | * there is, the more frequently a state will have to prove itself
|
---|
885 | * and be moved back to the front of the queue. If there is less
|
---|
886 | * contention, then popular states just aggregate in the front of
|
---|
887 | * the queue and stay there.
|
---|
888 | */
|
---|
889 | int is_semifree;
|
---|
890 |
|
---|
891 |
|
---|
892 | /* This keeps track of the size of the transition table for this
|
---|
893 | * state. There is a half-hearted attempt to support variable sized
|
---|
894 | * superstates.
|
---|
895 | */
|
---|
896 | int trans_size;
|
---|
897 |
|
---|
898 | /* Indexed by characters... */
|
---|
899 | struct rx_inx transitions[RX_TAIL_ARRAY];
|
---|
900 | };
|
---|
901 |
|
---|
902 |
|
---|
903 | /* A list of distinct futures define the edges that leave from a
|
---|
904 | * given superstate on a given character. c.f. rx_super_edge.
|
---|
905 | */
|
---|
906 |
|
---|
907 | struct rx_distinct_future
|
---|
908 | {
|
---|
909 | struct rx_distinct_future * next_same_super_edge[2];
|
---|
910 | struct rx_distinct_future * next_same_dest;
|
---|
911 | struct rx_distinct_future * prev_same_dest;
|
---|
912 | struct rx_superstate * present; /* source state */
|
---|
913 | struct rx_superstate * future; /* destination state */
|
---|
914 | struct rx_super_edge * edge;
|
---|
915 |
|
---|
916 |
|
---|
917 | /* The future_frame holds the instruction that should be executed
|
---|
918 | * after all the side effects are done, when it is time to complete
|
---|
919 | * the transition to the next state.
|
---|
920 | *
|
---|
921 | * Normally this is a next_char instruction, but it may be a
|
---|
922 | * cache_miss instruction as well, depending on whether or not
|
---|
923 | * the superstate is in the cache and semifree.
|
---|
924 | *
|
---|
925 | * If this is the only future for a given superstate/char, and
|
---|
926 | * if there are no side effects to be performed, this frame is
|
---|
927 | * not used (directly) at all. Instead, its contents are copied
|
---|
928 | * into the transition table of the starting state of this dist. future.
|
---|
929 | */
|
---|
930 | struct rx_inx future_frame;
|
---|
931 |
|
---|
932 | struct rx_inx side_effects_frame;
|
---|
933 | struct rx_se_list * effects;
|
---|
934 | };
|
---|
935 |
|
---|
936 | #define rx_lock_superstate(R,S) ((S)->locks++)
|
---|
937 | #define rx_unlock_superstate(R,S) (--(S)->locks)
|
---|
938 |
|
---|
939 | |
---|
940 |
|
---|
941 | /* This page destined for rx.h */
|
---|
942 |
|
---|
943 | struct rx_blocklist
|
---|
944 | {
|
---|
945 | struct rx_blocklist * next;
|
---|
946 | int bytes;
|
---|
947 | };
|
---|
948 |
|
---|
949 | struct rx_freelist
|
---|
950 | {
|
---|
951 | struct rx_freelist * next;
|
---|
952 | };
|
---|
953 |
|
---|
954 | struct rx_cache;
|
---|
955 |
|
---|
956 | #ifdef __STDC__
|
---|
957 | typedef void (*rx_morecore_fn)(struct rx_cache *);
|
---|
958 | #else
|
---|
959 | typedef void (*rx_morecore_fn)();
|
---|
960 | #endif
|
---|
961 |
|
---|
962 | /* You use this to control the allocation of superstate data
|
---|
963 | * during matching. Most of it should be initialized to 0.
|
---|
964 | *
|
---|
965 | * A MORECORE function is necessary. It should allocate
|
---|
966 | * a new block of memory or return 0.
|
---|
967 | * A default that uses malloc is called `rx_morecore'.
|
---|
968 | *
|
---|
969 | * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
|
---|
970 | * the system will try to allocate. The default is 128. Batch style
|
---|
971 | * applications that are very regexp intensive should use as high a number
|
---|
972 | * as possible without thrashing.
|
---|
973 | *
|
---|
974 | * The LOCAL_CSET_SIZE is the number of characters in a character set.
|
---|
975 | * It is therefore the number of entries in a superstate transition table.
|
---|
976 | * Generally, it should be 256. If your character set has 16 bits,
|
---|
977 | * it is better to translate your regexps into equivalent 8 bit patterns.
|
---|
978 | */
|
---|
979 |
|
---|
980 | struct rx_cache
|
---|
981 | {
|
---|
982 | struct rx_hash_rules superset_hash_rules;
|
---|
983 |
|
---|
984 | /* Objects are allocated by incrementing a pointer that
|
---|
985 | * scans across rx_blocklists.
|
---|
986 | */
|
---|
987 | struct rx_blocklist * memory;
|
---|
988 | struct rx_blocklist * memory_pos;
|
---|
989 | int bytes_left;
|
---|
990 | char * memory_addr;
|
---|
991 | rx_morecore_fn morecore;
|
---|
992 |
|
---|
993 | /* Freelists. */
|
---|
994 | struct rx_freelist * free_superstates;
|
---|
995 | struct rx_freelist * free_transition_classes;
|
---|
996 | struct rx_freelist * free_discernable_futures;
|
---|
997 | struct rx_freelist * free_supersets;
|
---|
998 | struct rx_freelist * free_hash;
|
---|
999 |
|
---|
1000 | /* Two sets of superstates -- those that are semifreed, and those
|
---|
1001 | * that are being used.
|
---|
1002 | */
|
---|
1003 | struct rx_superstate * lru_superstate;
|
---|
1004 | struct rx_superstate * semifree_superstate;
|
---|
1005 |
|
---|
1006 | struct rx_superset * empty_superset;
|
---|
1007 |
|
---|
1008 | int superstates;
|
---|
1009 | int semifree_superstates;
|
---|
1010 | int hits;
|
---|
1011 | int misses;
|
---|
1012 | int superstates_allowed;
|
---|
1013 |
|
---|
1014 | int local_cset_size;
|
---|
1015 | void ** instruction_table;
|
---|
1016 |
|
---|
1017 | struct rx_hash superset_table;
|
---|
1018 | };
|
---|
1019 |
|
---|
1020 | |
---|
1021 |
|
---|
1022 |
|
---|
1023 | /* The lowest-level search function supports arbitrarily fragmented
|
---|
1024 | * strings and (optionally) suspendable/resumable searches.
|
---|
1025 | *
|
---|
1026 | * Callers have to provide a few hooks.
|
---|
1027 | */
|
---|
1028 |
|
---|
1029 | #ifndef __GNUC__
|
---|
1030 | #ifdef __STDC__
|
---|
1031 | #define __const__ const
|
---|
1032 | #else
|
---|
1033 | #define __const__
|
---|
1034 | #endif
|
---|
1035 | #endif
|
---|
1036 |
|
---|
1037 | /* This holds a matcher position */
|
---|
1038 | struct rx_string_position
|
---|
1039 | {
|
---|
1040 | __const__ unsigned char * pos; /* The current pos. */
|
---|
1041 | __const__ unsigned char * string; /* The current string burst. */
|
---|
1042 | __const__ unsigned char * end; /* First invalid position >= POS. */
|
---|
1043 | int offset; /* Integer address of the current burst. */
|
---|
1044 | int size; /* Current string's size. */
|
---|
1045 | int search_direction; /* 1 or -1 */
|
---|
1046 | int search_end; /* First position to not try. */
|
---|
1047 | };
|
---|
1048 |
|
---|
1049 |
|
---|
1050 | enum rx_get_burst_return
|
---|
1051 | {
|
---|
1052 | rx_get_burst_continuation,
|
---|
1053 | rx_get_burst_error,
|
---|
1054 | rx_get_burst_ok,
|
---|
1055 | rx_get_burst_no_more
|
---|
1056 | };
|
---|
1057 |
|
---|
1058 |
|
---|
1059 | /* A call to get burst should make POS valid. It might be invalid
|
---|
1060 | * if the STRING field doesn't point to a burst that actually
|
---|
1061 | * contains POS.
|
---|
1062 | *
|
---|
1063 | * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to
|
---|
1064 | * whether or not to pad to the left. Padding to the right is always
|
---|
1065 | * appropriate, but need not go past the point indicated by STOP.
|
---|
1066 | *
|
---|
1067 | * If a continuation is returned, then the reentering call to
|
---|
1068 | * a search function will retry the get_burst.
|
---|
1069 | */
|
---|
1070 |
|
---|
1071 | #ifdef __STDC__
|
---|
1072 | typedef enum rx_get_burst_return
|
---|
1073 | (*rx_get_burst_fn) (struct rx_string_position * pos,
|
---|
1074 | void * app_closure,
|
---|
1075 | int stop);
|
---|
1076 |
|
---|
1077 | #else
|
---|
1078 | typedef enum rx_get_burst_return (*rx_get_burst_fn) ();
|
---|
1079 | #endif
|
---|
1080 |
|
---|
1081 |
|
---|
1082 | enum rx_back_check_return
|
---|
1083 | {
|
---|
1084 | rx_back_check_continuation,
|
---|
1085 | rx_back_check_error,
|
---|
1086 | rx_back_check_pass,
|
---|
1087 | rx_back_check_fail
|
---|
1088 | };
|
---|
1089 |
|
---|
1090 | /* Back_check should advance the position it is passed
|
---|
1091 | * over rparen - lparen characters and return pass iff
|
---|
1092 | * the characters starting at POS match those indexed
|
---|
1093 | * by [LPAREN..RPAREN].
|
---|
1094 | *
|
---|
1095 | * If a continuation is returned, then the reentering call to
|
---|
1096 | * a search function will retry the back_check.
|
---|
1097 | */
|
---|
1098 |
|
---|
1099 | #ifdef __STDC__
|
---|
1100 | typedef enum rx_back_check_return
|
---|
1101 | (*rx_back_check_fn) (struct rx_string_position * pos,
|
---|
1102 | int lparen,
|
---|
1103 | int rparen,
|
---|
1104 | unsigned char * translate,
|
---|
1105 | void * app_closure,
|
---|
1106 | int stop);
|
---|
1107 |
|
---|
1108 | #else
|
---|
1109 | typedef enum rx_back_check_return (*rx_back_check_fn) ();
|
---|
1110 | #endif
|
---|
1111 |
|
---|
1112 |
|
---|
1113 |
|
---|
1114 |
|
---|
1115 | /* A call to fetch_char should return the character at POS or POS + 1.
|
---|
1116 | * Returning continuations here isn't supported. OFFSET is either 0 or 1
|
---|
1117 | * and indicates which characters is desired.
|
---|
1118 | */
|
---|
1119 |
|
---|
1120 | #ifdef __STDC__
|
---|
1121 | typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos,
|
---|
1122 | int offset,
|
---|
1123 | void * app_closure,
|
---|
1124 | int stop);
|
---|
1125 | #else
|
---|
1126 | typedef int (*rx_fetch_char_fn) ();
|
---|
1127 | #endif
|
---|
1128 |
|
---|
1129 |
|
---|
1130 | enum rx_search_return
|
---|
1131 | {
|
---|
1132 | rx_search_continuation = -4,
|
---|
1133 | rx_search_error = -3,
|
---|
1134 | rx_search_soft_fail = -2, /* failed by running out of string */
|
---|
1135 | rx_search_fail = -1 /* failed only by reaching failure states */
|
---|
1136 | /* return values >= 0 indicate the position of a successful match */
|
---|
1137 | };
|
---|
1138 |
|
---|
1139 |
|
---|
1140 |
|
---|
1141 |
|
---|
1142 | |
---|
1143 |
|
---|
1144 |
|
---|
1145 | /* regex.h
|
---|
1146 | *
|
---|
1147 | * The remaining declarations replace regex.h.
|
---|
1148 | */
|
---|
1149 |
|
---|
1150 | /* This is an array of error messages corresponding to the error codes.
|
---|
1151 | */
|
---|
1152 | extern __const__ char *re_error_msg[];
|
---|
1153 |
|
---|
1154 | /* If any error codes are removed, changed, or added, update the
|
---|
1155 | `re_error_msg' table in regex.c. */
|
---|
1156 | typedef enum
|
---|
1157 | {
|
---|
1158 | REG_NOERROR = 0, /* Success. */
|
---|
1159 | REG_NOMATCH, /* Didn't find a match (for regexec). */
|
---|
1160 |
|
---|
1161 | /* POSIX regcomp return error codes. (In the order listed in the
|
---|
1162 | standard.) */
|
---|
1163 | REG_BADPAT, /* Invalid pattern. */
|
---|
1164 | REG_ECOLLATE, /* Not implemented. */
|
---|
1165 | REG_ECTYPE, /* Invalid character class name. */
|
---|
1166 | REG_EESCAPE, /* Trailing backslash. */
|
---|
1167 | REG_ESUBREG, /* Invalid back reference. */
|
---|
1168 | REG_EBRACK, /* Unmatched left bracket. */
|
---|
1169 | REG_EPAREN, /* Parenthesis imbalance. */
|
---|
1170 | REG_EBRACE, /* Unmatched \{. */
|
---|
1171 | REG_BADBR, /* Invalid contents of \{\}. */
|
---|
1172 | REG_ERANGE, /* Invalid range end. */
|
---|
1173 | REG_ESPACE, /* Ran out of memory. */
|
---|
1174 | REG_BADRPT, /* No preceding re for repetition op. */
|
---|
1175 |
|
---|
1176 | /* Error codes we've added. */
|
---|
1177 | REG_EEND, /* Premature end. */
|
---|
1178 | REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
|
---|
1179 | REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */
|
---|
1180 | } reg_errcode_t;
|
---|
1181 |
|
---|
1182 | /* The regex.c support, as a client of rx, defines a set of possible
|
---|
1183 | * side effects that can be added to the edge lables of nfa edges.
|
---|
1184 | * Here is the list of sidef effects in use.
|
---|
1185 | */
|
---|
1186 |
|
---|
1187 | enum re_side_effects
|
---|
1188 | {
|
---|
1189 | #define RX_WANT_SE_DEFS 1
|
---|
1190 | #undef RX_DEF_SE
|
---|
1191 | #undef RX_DEF_CPLX_SE
|
---|
1192 | #define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE,
|
---|
1193 | #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE,
|
---|
1194 | #include "rx.h"
|
---|
1195 | #undef RX_DEF_SE
|
---|
1196 | #undef RX_DEF_CPLX_SE
|
---|
1197 | #undef RX_WANT_SE_DEFS
|
---|
1198 | re_floogle_flap = 65533
|
---|
1199 | };
|
---|
1200 |
|
---|
1201 | /* These hold paramaters for the kinds of side effects that are possible
|
---|
1202 | * in the supported pattern languages. These include things like the
|
---|
1203 | * numeric bounds of {} operators and the index of paren registers for
|
---|
1204 | * subexpression measurement or backreferencing.
|
---|
1205 | */
|
---|
1206 | struct re_se_params
|
---|
1207 | {
|
---|
1208 | enum re_side_effects se;
|
---|
1209 | int op1;
|
---|
1210 | int op2;
|
---|
1211 | };
|
---|
1212 |
|
---|
1213 | typedef unsigned reg_syntax_t;
|
---|
1214 |
|
---|
1215 | struct re_pattern_buffer
|
---|
1216 | {
|
---|
1217 | struct rx rx;
|
---|
1218 | reg_syntax_t syntax; /* See below for syntax bit definitions. */
|
---|
1219 |
|
---|
1220 | unsigned int no_sub:1; /* If set, don't return register offsets. */
|
---|
1221 | unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */
|
---|
1222 | unsigned int not_eol:1; /* match at the ends of the string. */
|
---|
1223 | unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/
|
---|
1224 | unsigned int least_subs:1; /* If set, and returning registers, return
|
---|
1225 | * as few values as possible. Only
|
---|
1226 | * backreferenced groups and group 0 (the whole
|
---|
1227 | * match) will be returned.
|
---|
1228 | */
|
---|
1229 |
|
---|
1230 | /* If true, this says that the matcher should keep registers on its
|
---|
1231 | * backtracking stack. For many patterns, we can easily determine that
|
---|
1232 | * this isn't necessary.
|
---|
1233 | */
|
---|
1234 | unsigned int match_regs_on_stack:1;
|
---|
1235 | unsigned int search_regs_on_stack:1;
|
---|
1236 |
|
---|
1237 | /* is_anchored and begbuf_only are filled in by rx_compile. */
|
---|
1238 | unsigned int is_anchored:1; /* Anchorded by ^? */
|
---|
1239 | unsigned int begbuf_only:1; /* Anchored to char position 0? */
|
---|
1240 |
|
---|
1241 |
|
---|
1242 | /* If REGS_UNALLOCATED, allocate space in the `regs' structure
|
---|
1243 | * for `max (RE_NREGS, re_nsub + 1)' groups.
|
---|
1244 | * If REGS_REALLOCATE, reallocate space if necessary.
|
---|
1245 | * If REGS_FIXED, use what's there.
|
---|
1246 | */
|
---|
1247 | #define REGS_UNALLOCATED 0
|
---|
1248 | #define REGS_REALLOCATE 1
|
---|
1249 | #define REGS_FIXED 2
|
---|
1250 | unsigned int regs_allocated:2;
|
---|
1251 |
|
---|
1252 |
|
---|
1253 | /* Either a translate table to apply to all characters before
|
---|
1254 | * comparing them, or zero for no translation. The translation
|
---|
1255 | * is applied to a pattern when it is compiled and to a string
|
---|
1256 | * when it is matched.
|
---|
1257 | */
|
---|
1258 | unsigned char * translate;
|
---|
1259 |
|
---|
1260 | /* If this is a valid pointer, it tells rx not to store the extents of
|
---|
1261 | * certain subexpressions (those corresponding to non-zero entries).
|
---|
1262 | * Passing 0x1 is the same as passing an array of all ones. Passing 0x0
|
---|
1263 | * is the same as passing an array of all zeros.
|
---|
1264 | * The array should contain as many entries as their are subexps in the
|
---|
1265 | * regexp.
|
---|
1266 | *
|
---|
1267 | * For POSIX compatability, when using regcomp and regexec this field
|
---|
1268 | * is zeroed and ignored.
|
---|
1269 | */
|
---|
1270 | char * syntax_parens;
|
---|
1271 |
|
---|
1272 | /* Number of subexpressions found by the compiler. */
|
---|
1273 | size_t re_nsub;
|
---|
1274 |
|
---|
1275 | void * buffer; /* Malloced memory for the nfa. */
|
---|
1276 | unsigned long allocated; /* Size of that memory. */
|
---|
1277 |
|
---|
1278 | /* Pointer to a fastmap, if any, otherwise zero. re_search uses
|
---|
1279 | * the fastmap, if there is one, to skip over impossible
|
---|
1280 | * starting points for matches. */
|
---|
1281 | char *fastmap;
|
---|
1282 |
|
---|
1283 | unsigned int fastmap_accurate:1; /* These three are internal. */
|
---|
1284 | unsigned int can_match_empty:1;
|
---|
1285 | struct rx_nfa_state * start; /* The nfa starting state. */
|
---|
1286 |
|
---|
1287 | /* This is the list of iterator bounds for {lo,hi} constructs.
|
---|
1288 | * The memory pointed to is part of the rx->buffer.
|
---|
1289 | */
|
---|
1290 | struct re_se_params *se_params;
|
---|
1291 |
|
---|
1292 | /* This is a bitset representation of the fastmap.
|
---|
1293 | * This is a true fastmap that already takes the translate
|
---|
1294 | * table into account.
|
---|
1295 | */
|
---|
1296 | rx_Bitset fastset;
|
---|
1297 | };
|
---|
1298 |
|
---|
1299 | /* Type for byte offsets within the string. POSIX mandates this. */
|
---|
1300 | typedef int regoff_t;
|
---|
1301 |
|
---|
1302 | /* This is the structure we store register match data in. See
|
---|
1303 | regex.texinfo for a full description of what registers match. */
|
---|
1304 | struct re_registers
|
---|
1305 | {
|
---|
1306 | unsigned num_regs;
|
---|
1307 | regoff_t *start;
|
---|
1308 | regoff_t *end;
|
---|
1309 | };
|
---|
1310 |
|
---|
1311 | typedef struct re_pattern_buffer regex_t;
|
---|
1312 |
|
---|
1313 | /* POSIX specification for registers. Aside from the different names than
|
---|
1314 | `re_registers', POSIX uses an array of structures, instead of a
|
---|
1315 | structure of arrays. */
|
---|
1316 | typedef struct
|
---|
1317 | {
|
---|
1318 | regoff_t rm_so; /* Byte offset from string's start to substring's start. */
|
---|
1319 | regoff_t rm_eo; /* Byte offset from string's start to substring's end. */
|
---|
1320 | } regmatch_t;
|
---|
1321 |
|
---|
1322 | |
---|
1323 |
|
---|
1324 | /* The following bits are used to determine the regexp syntax we
|
---|
1325 | recognize. The set/not-set meanings are chosen so that Emacs syntax
|
---|
1326 | remains the value 0. The bits are given in alphabetical order, and
|
---|
1327 | the definitions shifted by one from the previous bit; thus, when we
|
---|
1328 | add or remove a bit, only one other definition need change. */
|
---|
1329 |
|
---|
1330 | /* If this bit is not set, then \ inside a bracket expression is literal.
|
---|
1331 | If set, then such a \ quotes the following character. */
|
---|
1332 | #define RE_BACKSLASH_ESCAPE_IN_LISTS (1)
|
---|
1333 |
|
---|
1334 | /* If this bit is not set, then + and ? are operators, and \+ and \? are
|
---|
1335 | literals.
|
---|
1336 | If set, then \+ and \? are operators and + and ? are literals. */
|
---|
1337 | #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
|
---|
1338 |
|
---|
1339 | /* If this bit is set, then character classes are supported. They are:
|
---|
1340 | [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
|
---|
1341 | [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
|
---|
1342 | If not set, then character classes are not supported. */
|
---|
1343 | #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
|
---|
1344 |
|
---|
1345 | /* If this bit is set, then ^ and $ are always anchors (outside bracket
|
---|
1346 | expressions, of course).
|
---|
1347 | If this bit is not set, then it depends:
|
---|
1348 | ^ is an anchor if it is at the beginning of a regular
|
---|
1349 | expression or after an open-group or an alternation operator;
|
---|
1350 | $ is an anchor if it is at the end of a regular expression, or
|
---|
1351 | before a close-group or an alternation operator.
|
---|
1352 |
|
---|
1353 | This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
|
---|
1354 | POSIX draft 11.2 says that * etc. in leading positions is undefined.
|
---|
1355 | We already implemented a previous draft which made those constructs
|
---|
1356 | invalid, though, so we haven't changed the code back. */
|
---|
1357 | #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
|
---|
1358 |
|
---|
1359 | /* If this bit is set, then special characters are always special
|
---|
1360 | regardless of where they are in the pattern.
|
---|
1361 | If this bit is not set, then special characters are special only in
|
---|
1362 | some contexts; otherwise they are ordinary. Specifically,
|
---|
1363 | * + ? and intervals are only special when not after the beginning,
|
---|
1364 | open-group, or alternation operator. */
|
---|
1365 | #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
|
---|
1366 |
|
---|
1367 | /* If this bit is set, then *, +, ?, and { cannot be first in an re or
|
---|
1368 | immediately after an alternation or begin-group operator. */
|
---|
1369 | #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
|
---|
1370 |
|
---|
1371 | /* If this bit is set, then . matches newline.
|
---|
1372 | If not set, then it doesn't. */
|
---|
1373 | #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
|
---|
1374 |
|
---|
1375 | /* If this bit is set, then . doesn't match NUL.
|
---|
1376 | If not set, then it does. */
|
---|
1377 | #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
|
---|
1378 |
|
---|
1379 | /* If this bit is set, nonmatching lists [^...] do not match newline.
|
---|
1380 | If not set, they do. */
|
---|
1381 | #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
|
---|
1382 |
|
---|
1383 | /* If this bit is set, either \{...\} or {...} defines an
|
---|
1384 | interval, depending on RE_NO_BK_BRACES.
|
---|
1385 | If not set, \{, \}, {, and } are literals. */
|
---|
1386 | #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
|
---|
1387 |
|
---|
1388 | /* If this bit is set, +, ? and | aren't recognized as operators.
|
---|
1389 | If not set, they are. */
|
---|
1390 | #define RE_LIMITED_OPS (RE_INTERVALS << 1)
|
---|
1391 |
|
---|
1392 | /* If this bit is set, newline is an alternation operator.
|
---|
1393 | If not set, newline is literal. */
|
---|
1394 | #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
|
---|
1395 |
|
---|
1396 | /* If this bit is set, then `{...}' defines an interval, and \{ and \}
|
---|
1397 | are literals.
|
---|
1398 | If not set, then `\{...\}' defines an interval. */
|
---|
1399 | #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
|
---|
1400 |
|
---|
1401 | /* If this bit is set, (...) defines a group, and \( and \) are literals.
|
---|
1402 | If not set, \(...\) defines a group, and ( and ) are literals. */
|
---|
1403 | #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
|
---|
1404 |
|
---|
1405 | /* If this bit is set, then \<digit> matches <digit>.
|
---|
1406 | If not set, then \<digit> is a back-reference. */
|
---|
1407 | #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
|
---|
1408 |
|
---|
1409 | /* If this bit is set, then | is an alternation operator, and \| is literal.
|
---|
1410 | If not set, then \| is an alternation operator, and | is literal. */
|
---|
1411 | #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
|
---|
1412 |
|
---|
1413 | /* If this bit is set, then an ending range point collating higher
|
---|
1414 | than the starting range point, as in [z-a], is invalid.
|
---|
1415 | If not set, then when ending range point collates higher than the
|
---|
1416 | starting range point, the range is ignored. */
|
---|
1417 | #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
|
---|
1418 |
|
---|
1419 | /* If this bit is set, then an unmatched ) is ordinary.
|
---|
1420 | If not set, then an unmatched ) is invalid. */
|
---|
1421 | #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
|
---|
1422 |
|
---|
1423 | /* This global variable defines the particular regexp syntax to use (for
|
---|
1424 | some interfaces). When a regexp is compiled, the syntax used is
|
---|
1425 | stored in the pattern buffer, so changing this does not affect
|
---|
1426 | already-compiled regexps. */
|
---|
1427 | extern reg_syntax_t re_syntax_options;
|
---|
1428 | |
---|
1429 |
|
---|
1430 | /* Define combinations of the above bits for the standard possibilities.
|
---|
1431 | (The [[[ comments delimit what gets put into the Texinfo file, so
|
---|
1432 | don't delete them!) */
|
---|
1433 | /* [[[begin syntaxes]]] */
|
---|
1434 | #define RE_SYNTAX_EMACS 0
|
---|
1435 |
|
---|
1436 | #define RE_SYNTAX_AWK \
|
---|
1437 | (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \
|
---|
1438 | | RE_NO_BK_PARENS | RE_NO_BK_REFS \
|
---|
1439 | | RE_NO_BK_VAR | RE_NO_EMPTY_RANGES \
|
---|
1440 | | RE_UNMATCHED_RIGHT_PAREN_ORD)
|
---|
1441 |
|
---|
1442 | #define RE_SYNTAX_POSIX_AWK \
|
---|
1443 | (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
|
---|
1444 |
|
---|
1445 | #define RE_SYNTAX_GREP \
|
---|
1446 | (RE_BK_PLUS_QM | RE_CHAR_CLASSES \
|
---|
1447 | | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \
|
---|
1448 | | RE_NEWLINE_ALT)
|
---|
1449 |
|
---|
1450 | #define RE_SYNTAX_EGREP \
|
---|
1451 | (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \
|
---|
1452 | | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \
|
---|
1453 | | RE_NEWLINE_ALT | RE_NO_BK_PARENS \
|
---|
1454 | | RE_NO_BK_VBAR)
|
---|
1455 |
|
---|
1456 | #define RE_SYNTAX_POSIX_EGREP \
|
---|
1457 | (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
|
---|
1458 |
|
---|
1459 | #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
|
---|
1460 |
|
---|
1461 | /* Syntax bits common to both basic and extended POSIX regex syntax. */
|
---|
1462 | #define _RE_SYNTAX_POSIX_COMMON \
|
---|
1463 | (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \
|
---|
1464 | | RE_INTERVALS | RE_NO_EMPTY_RANGES)
|
---|
1465 |
|
---|
1466 | #define RE_SYNTAX_POSIX_BASIC \
|
---|
1467 | (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
|
---|
1468 |
|
---|
1469 | /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
|
---|
1470 | RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
|
---|
1471 | isn't minimal, since other operators, such as \`, aren't disabled. */
|
---|
1472 | #define RE_SYNTAX_POSIX_MINIMAL_BASIC \
|
---|
1473 | (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
|
---|
1474 |
|
---|
1475 | #define RE_SYNTAX_POSIX_EXTENDED \
|
---|
1476 | (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
|
---|
1477 | | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
|
---|
1478 | | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
|
---|
1479 | | RE_UNMATCHED_RIGHT_PAREN_ORD)
|
---|
1480 |
|
---|
1481 | /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
|
---|
1482 | replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
|
---|
1483 | #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \
|
---|
1484 | (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
|
---|
1485 | | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \
|
---|
1486 | | RE_NO_BK_PARENS | RE_NO_BK_REFS \
|
---|
1487 | | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD)
|
---|
1488 | /* [[[end syntaxes]]] */
|
---|
1489 |
|
---|
1490 | /* Maximum number of duplicates an interval can allow. Some systems
|
---|
1491 | (erroneously) define this in other header files, but we want our
|
---|
1492 | value, so remove any previous define. */
|
---|
1493 | #ifdef RE_DUP_MAX
|
---|
1494 | #undef RE_DUP_MAX
|
---|
1495 | #endif
|
---|
1496 | #define RE_DUP_MAX ((1 << 15) - 1)
|
---|
1497 |
|
---|
1498 |
|
---|
1499 |
|
---|
1500 | /* POSIX `cflags' bits (i.e., information for `regcomp'). */
|
---|
1501 |
|
---|
1502 | /* If this bit is set, then use extended regular expression syntax.
|
---|
1503 | If not set, then use basic regular expression syntax. */
|
---|
1504 | #define REG_EXTENDED 1
|
---|
1505 |
|
---|
1506 | /* If this bit is set, then ignore case when matching.
|
---|
1507 | If not set, then case is significant. */
|
---|
1508 | #define REG_ICASE (REG_EXTENDED << 1)
|
---|
1509 |
|
---|
1510 | /* If this bit is set, then anchors do not match at newline
|
---|
1511 | characters in the string.
|
---|
1512 | If not set, then anchors do match at newlines. */
|
---|
1513 | #define REG_NEWLINE (REG_ICASE << 1)
|
---|
1514 |
|
---|
1515 | /* If this bit is set, then report only success or fail in regexec.
|
---|
1516 | If not set, then returns differ between not matching and errors. */
|
---|
1517 | #define REG_NOSUB (REG_NEWLINE << 1)
|
---|
1518 |
|
---|
1519 |
|
---|
1520 | /* POSIX `eflags' bits (i.e., information for regexec). */
|
---|
1521 |
|
---|
1522 | /* If this bit is set, then the beginning-of-line operator doesn't match
|
---|
1523 | the beginning of the string (presumably because it's not the
|
---|
1524 | beginning of a line).
|
---|
1525 | If not set, then the beginning-of-line operator does match the
|
---|
1526 | beginning of the string. */
|
---|
1527 | #define REG_NOTBOL 1
|
---|
1528 |
|
---|
1529 | /* Like REG_NOTBOL, except for the end-of-line. */
|
---|
1530 | #define REG_NOTEOL (1 << 1)
|
---|
1531 |
|
---|
1532 | /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
|
---|
1533 | * `re_match_2' returns information about at least this many registers
|
---|
1534 | * the first time a `regs' structure is passed.
|
---|
1535 | *
|
---|
1536 | * Also, this is the greatest number of backreferenced subexpressions
|
---|
1537 | * allowed in a pattern being matched without caller-supplied registers.
|
---|
1538 | */
|
---|
1539 | #ifndef RE_NREGS
|
---|
1540 | #define RE_NREGS 30
|
---|
1541 | #endif
|
---|
1542 |
|
---|
1543 | extern int rx_cache_bound;
|
---|
1544 | extern char rx_version_string[];
|
---|
1545 |
|
---|
1546 |
|
---|
1547 | |
---|
1548 |
|
---|
1549 | #ifdef RX_WANT_RX_DEFS
|
---|
1550 |
|
---|
1551 | /* This is decls to the interesting subsystems and lower layers
|
---|
1552 | * of rx. Everything which doesn't have a public counterpart in
|
---|
1553 | * regex.c is declared here.
|
---|
1554 | */
|
---|
1555 |
|
---|
1556 |
|
---|
1557 | #ifdef __STDC__
|
---|
1558 | typedef void (*rx_hash_freefn) (struct rx_hash_item * it);
|
---|
1559 | #else /* ndef __STDC__ */
|
---|
1560 | typedef void (*rx_hash_freefn) ();
|
---|
1561 | #endif /* ndef __STDC__ */
|
---|
1562 |
|
---|
1563 |
|
---|
1564 | |
---|
1565 |
|
---|
1566 |
|
---|
1567 | #ifdef __STDC__
|
---|
1568 | RX_DECL int rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b);
|
---|
1569 | RX_DECL int rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b);
|
---|
1570 | RX_DECL int rx_bitset_empty (int size, rx_Bitset set);
|
---|
1571 | RX_DECL void rx_bitset_null (int size, rx_Bitset b);
|
---|
1572 | RX_DECL void rx_bitset_universe (int size, rx_Bitset b);
|
---|
1573 | RX_DECL void rx_bitset_complement (int size, rx_Bitset b);
|
---|
1574 | RX_DECL void rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b);
|
---|
1575 | RX_DECL void rx_bitset_union (int size, rx_Bitset a, rx_Bitset b);
|
---|
1576 | RX_DECL void rx_bitset_intersection (int size,
|
---|
1577 | rx_Bitset a, rx_Bitset b);
|
---|
1578 | RX_DECL void rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b);
|
---|
1579 | RX_DECL void rx_bitset_revdifference (int size,
|
---|
1580 | rx_Bitset a, rx_Bitset b);
|
---|
1581 | RX_DECL void rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b);
|
---|
1582 | RX_DECL unsigned long rx_bitset_hash (int size, rx_Bitset b);
|
---|
1583 | RX_DECL struct rx_hash_item * rx_hash_find (struct rx_hash * table,
|
---|
1584 | unsigned long hash,
|
---|
1585 | void * value,
|
---|
1586 | struct rx_hash_rules * rules);
|
---|
1587 | RX_DECL struct rx_hash_item * rx_hash_store (struct rx_hash * table,
|
---|
1588 | unsigned long hash,
|
---|
1589 | void * value,
|
---|
1590 | struct rx_hash_rules * rules);
|
---|
1591 | RX_DECL void rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules);
|
---|
1592 | RX_DECL void rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
|
---|
1593 | struct rx_hash_rules * rules);
|
---|
1594 | RX_DECL rx_Bitset rx_cset (struct rx *rx);
|
---|
1595 | RX_DECL rx_Bitset rx_copy_cset (struct rx *rx, rx_Bitset a);
|
---|
1596 | RX_DECL void rx_free_cset (struct rx * rx, rx_Bitset c);
|
---|
1597 | RX_DECL struct rexp_node * rexp_node (struct rx *rx,
|
---|
1598 | enum rexp_node_type type);
|
---|
1599 | RX_DECL struct rexp_node * rx_mk_r_cset (struct rx * rx,
|
---|
1600 | rx_Bitset b);
|
---|
1601 | RX_DECL struct rexp_node * rx_mk_r_concat (struct rx * rx,
|
---|
1602 | struct rexp_node * a,
|
---|
1603 | struct rexp_node * b);
|
---|
1604 | RX_DECL struct rexp_node * rx_mk_r_alternate (struct rx * rx,
|
---|
1605 | struct rexp_node * a,
|
---|
1606 | struct rexp_node * b);
|
---|
1607 | RX_DECL struct rexp_node * rx_mk_r_opt (struct rx * rx,
|
---|
1608 | struct rexp_node * a);
|
---|
1609 | RX_DECL struct rexp_node * rx_mk_r_star (struct rx * rx,
|
---|
1610 | struct rexp_node * a);
|
---|
1611 | RX_DECL struct rexp_node * rx_mk_r_2phase_star (struct rx * rx,
|
---|
1612 | struct rexp_node * a,
|
---|
1613 | struct rexp_node * b);
|
---|
1614 | RX_DECL struct rexp_node * rx_mk_r_side_effect (struct rx * rx,
|
---|
1615 | rx_side_effect a);
|
---|
1616 | RX_DECL struct rexp_node * rx_mk_r_data (struct rx * rx,
|
---|
1617 | void * a);
|
---|
1618 | RX_DECL void rx_free_rexp (struct rx * rx, struct rexp_node * node);
|
---|
1619 | RX_DECL struct rexp_node * rx_copy_rexp (struct rx *rx,
|
---|
1620 | struct rexp_node *node);
|
---|
1621 | RX_DECL struct rx_nfa_state * rx_nfa_state (struct rx *rx);
|
---|
1622 | RX_DECL void rx_free_nfa_state (struct rx_nfa_state * n);
|
---|
1623 | RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (struct rx * rx,
|
---|
1624 | int id);
|
---|
1625 | RX_DECL struct rx_nfa_edge * rx_nfa_edge (struct rx *rx,
|
---|
1626 | enum rx_nfa_etype type,
|
---|
1627 | struct rx_nfa_state *start,
|
---|
1628 | struct rx_nfa_state *dest);
|
---|
1629 | RX_DECL void rx_free_nfa_edge (struct rx_nfa_edge * e);
|
---|
1630 | RX_DECL void rx_free_nfa (struct rx *rx);
|
---|
1631 | RX_DECL int rx_build_nfa (struct rx *rx,
|
---|
1632 | struct rexp_node *rexp,
|
---|
1633 | struct rx_nfa_state **start,
|
---|
1634 | struct rx_nfa_state **end);
|
---|
1635 | RX_DECL void rx_name_nfa_states (struct rx *rx);
|
---|
1636 | RX_DECL int rx_eclose_nfa (struct rx *rx);
|
---|
1637 | RX_DECL void rx_delete_epsilon_transitions (struct rx *rx);
|
---|
1638 | RX_DECL int rx_compactify_nfa (struct rx *rx,
|
---|
1639 | void **mem, unsigned long *size);
|
---|
1640 | RX_DECL void rx_release_superset (struct rx *rx,
|
---|
1641 | struct rx_superset *set);
|
---|
1642 | RX_DECL struct rx_superset * rx_superset_cons (struct rx * rx,
|
---|
1643 | struct rx_nfa_state *car, struct rx_superset *cdr);
|
---|
1644 | RX_DECL struct rx_superset * rx_superstate_eclosure_union
|
---|
1645 | (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl);
|
---|
1646 | RX_DECL struct rx_superstate * rx_superstate (struct rx *rx,
|
---|
1647 | struct rx_superset *set);
|
---|
1648 | RX_DECL struct rx_inx * rx_handle_cache_miss
|
---|
1649 | (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data);
|
---|
1650 | RX_DECL reg_errcode_t rx_compile (__const__ char *pattern, int size,
|
---|
1651 | reg_syntax_t syntax,
|
---|
1652 | struct re_pattern_buffer * rxb);
|
---|
1653 | RX_DECL void rx_blow_up_fastmap (struct re_pattern_buffer * rxb);
|
---|
1654 | #else /* STDC */
|
---|
1655 | RX_DECL int rx_bitset_is_equal ();
|
---|
1656 | RX_DECL int rx_bitset_is_subset ();
|
---|
1657 | RX_DECL int rx_bitset_empty ();
|
---|
1658 | RX_DECL void rx_bitset_null ();
|
---|
1659 | RX_DECL void rx_bitset_universe ();
|
---|
1660 | RX_DECL void rx_bitset_complement ();
|
---|
1661 | RX_DECL void rx_bitset_assign ();
|
---|
1662 | RX_DECL void rx_bitset_union ();
|
---|
1663 | RX_DECL void rx_bitset_intersection ();
|
---|
1664 | RX_DECL void rx_bitset_difference ();
|
---|
1665 | RX_DECL void rx_bitset_revdifference ();
|
---|
1666 | RX_DECL void rx_bitset_xor ();
|
---|
1667 | RX_DECL unsigned long rx_bitset_hash ();
|
---|
1668 | RX_DECL struct rx_hash_item * rx_hash_find ();
|
---|
1669 | RX_DECL struct rx_hash_item * rx_hash_store ();
|
---|
1670 | RX_DECL void rx_hash_free ();
|
---|
1671 | RX_DECL void rx_free_hash_table ();
|
---|
1672 | RX_DECL rx_Bitset rx_cset ();
|
---|
1673 | RX_DECL rx_Bitset rx_copy_cset ();
|
---|
1674 | RX_DECL void rx_free_cset ();
|
---|
1675 | RX_DECL struct rexp_node * rexp_node ();
|
---|
1676 | RX_DECL struct rexp_node * rx_mk_r_cset ();
|
---|
1677 | RX_DECL struct rexp_node * rx_mk_r_concat ();
|
---|
1678 | RX_DECL struct rexp_node * rx_mk_r_alternate ();
|
---|
1679 | RX_DECL struct rexp_node * rx_mk_r_opt ();
|
---|
1680 | RX_DECL struct rexp_node * rx_mk_r_star ();
|
---|
1681 | RX_DECL struct rexp_node * rx_mk_r_2phase_star ();
|
---|
1682 | RX_DECL struct rexp_node * rx_mk_r_side_effect ();
|
---|
1683 | RX_DECL struct rexp_node * rx_mk_r_data ();
|
---|
1684 | RX_DECL void rx_free_rexp ();
|
---|
1685 | RX_DECL struct rexp_node * rx_copy_rexp ();
|
---|
1686 | RX_DECL struct rx_nfa_state * rx_nfa_state ();
|
---|
1687 | RX_DECL void rx_free_nfa_state ();
|
---|
1688 | RX_DECL struct rx_nfa_state * rx_id_to_nfa_state ();
|
---|
1689 | RX_DECL struct rx_nfa_edge * rx_nfa_edge ();
|
---|
1690 | RX_DECL void rx_free_nfa_edge ();
|
---|
1691 | RX_DECL void rx_free_nfa ();
|
---|
1692 | RX_DECL int rx_build_nfa ();
|
---|
1693 | RX_DECL void rx_name_nfa_states ();
|
---|
1694 | RX_DECL int rx_eclose_nfa ();
|
---|
1695 | RX_DECL void rx_delete_epsilon_transitions ();
|
---|
1696 | RX_DECL int rx_compactify_nfa ();
|
---|
1697 | RX_DECL void rx_release_superset ();
|
---|
1698 | RX_DECL struct rx_superset * rx_superset_cons ();
|
---|
1699 | RX_DECL struct rx_superset * rx_superstate_eclosure_union ();
|
---|
1700 | RX_DECL struct rx_superstate * rx_superstate ();
|
---|
1701 | RX_DECL struct rx_inx * rx_handle_cache_miss ();
|
---|
1702 | RX_DECL reg_errcode_t rx_compile ();
|
---|
1703 | RX_DECL void rx_blow_up_fastmap ();
|
---|
1704 | #endif /* STDC */
|
---|
1705 |
|
---|
1706 |
|
---|
1707 | #endif /* RX_WANT_RX_DEFS */
|
---|
1708 |
|
---|
1709 |
|
---|
1710 | |
---|
1711 |
|
---|
1712 | #ifdef __STDC__
|
---|
1713 | extern int re_search_2 (struct re_pattern_buffer *rxb,
|
---|
1714 | __const__ char * string1, int size1,
|
---|
1715 | __const__ char * string2, int size2,
|
---|
1716 | int startpos, int range,
|
---|
1717 | struct re_registers *regs,
|
---|
1718 | int stop);
|
---|
1719 | extern int re_search (struct re_pattern_buffer * rxb, __const__ char *string,
|
---|
1720 | int size, int startpos, int range,
|
---|
1721 | struct re_registers *regs);
|
---|
1722 | extern int re_match_2 (struct re_pattern_buffer * rxb,
|
---|
1723 | __const__ char * string1, int size1,
|
---|
1724 | __const__ char * string2, int size2,
|
---|
1725 | int pos, struct re_registers *regs, int stop);
|
---|
1726 | extern int re_match (struct re_pattern_buffer * rxb,
|
---|
1727 | __const__ char * string,
|
---|
1728 | int size, int pos,
|
---|
1729 | struct re_registers *regs);
|
---|
1730 | extern reg_syntax_t re_set_syntax (reg_syntax_t syntax);
|
---|
1731 | extern void re_set_registers (struct re_pattern_buffer *bufp,
|
---|
1732 | struct re_registers *regs,
|
---|
1733 | unsigned num_regs,
|
---|
1734 | regoff_t * starts, regoff_t * ends);
|
---|
1735 | extern __const__ char * re_compile_pattern (__const__ char *pattern,
|
---|
1736 | int length,
|
---|
1737 | struct re_pattern_buffer * rxb);
|
---|
1738 | extern int re_compile_fastmap (struct re_pattern_buffer * rxb);
|
---|
1739 | extern char * re_comp (__const__ char *s);
|
---|
1740 | extern int re_exec (__const__ char *s);
|
---|
1741 | extern int regcomp (regex_t * preg, __const__ char * pattern, int cflags);
|
---|
1742 | extern int regexec (__const__ regex_t *preg, __const__ char *string,
|
---|
1743 | size_t nmatch, regmatch_t pmatch[],
|
---|
1744 | int eflags);
|
---|
1745 | extern size_t regerror (int errcode, __const__ regex_t *preg,
|
---|
1746 | char *errbuf, size_t errbuf_size);
|
---|
1747 | extern void regfree (regex_t *preg);
|
---|
1748 |
|
---|
1749 | #else /* STDC */
|
---|
1750 | extern int re_search_2 ();
|
---|
1751 | extern int re_search ();
|
---|
1752 | extern int re_match_2 ();
|
---|
1753 | extern int re_match ();
|
---|
1754 | extern reg_syntax_t re_set_syntax ();
|
---|
1755 | extern void re_set_registers ();
|
---|
1756 | extern __const__ char * re_compile_pattern ();
|
---|
1757 | extern int re_compile_fastmap ();
|
---|
1758 | extern char * re_comp ();
|
---|
1759 | extern int re_exec ();
|
---|
1760 | extern int regcomp ();
|
---|
1761 | extern int regexec ();
|
---|
1762 | extern size_t regerror ();
|
---|
1763 | extern void regfree ();
|
---|
1764 |
|
---|
1765 | #endif /* STDC */
|
---|
1766 |
|
---|
1767 | |
---|
1768 |
|
---|
1769 |
|
---|
1770 | #ifdef RX_WANT_RX_DEFS
|
---|
1771 |
|
---|
1772 | struct rx_counter_frame
|
---|
1773 | {
|
---|
1774 | int tag;
|
---|
1775 | int val;
|
---|
1776 | struct rx_counter_frame * inherited_from; /* If this is a copy. */
|
---|
1777 | struct rx_counter_frame * cdr;
|
---|
1778 | };
|
---|
1779 |
|
---|
1780 | struct rx_backtrack_frame
|
---|
1781 | {
|
---|
1782 | char * counter_stack_sp;
|
---|
1783 |
|
---|
1784 | /* A frame is used to save the matchers state when it crosses a
|
---|
1785 | * backtracking point. The `stk_' fields correspond to variables
|
---|
1786 | * in re_search_2 (just strip off thes `stk_'). They are documented
|
---|
1787 | * tere.
|
---|
1788 | */
|
---|
1789 | struct rx_superstate * stk_super;
|
---|
1790 | unsigned int stk_c;
|
---|
1791 | struct rx_string_position stk_test_pos;
|
---|
1792 | int stk_last_l;
|
---|
1793 | int stk_last_r;
|
---|
1794 | int stk_test_ret;
|
---|
1795 |
|
---|
1796 | /* This is the list of options left to explore at the backtrack
|
---|
1797 | * point for which this frame was created.
|
---|
1798 | */
|
---|
1799 | struct rx_distinct_future * df;
|
---|
1800 | struct rx_distinct_future * first_df;
|
---|
1801 |
|
---|
1802 | #ifdef RX_DEBUG
|
---|
1803 | int stk_line_no;
|
---|
1804 | #endif
|
---|
1805 | };
|
---|
1806 |
|
---|
1807 | struct rx_stack_chunk
|
---|
1808 | {
|
---|
1809 | struct rx_stack_chunk * next_chunk;
|
---|
1810 | int bytes_left;
|
---|
1811 | char * sp;
|
---|
1812 | };
|
---|
1813 |
|
---|
1814 | enum rx_outer_entry
|
---|
1815 | {
|
---|
1816 | rx_outer_start,
|
---|
1817 | rx_outer_fastmap,
|
---|
1818 | rx_outer_test,
|
---|
1819 | rx_outer_restore_pos
|
---|
1820 | };
|
---|
1821 |
|
---|
1822 | enum rx_fastmap_return
|
---|
1823 | {
|
---|
1824 | rx_fastmap_continuation,
|
---|
1825 | rx_fastmap_error,
|
---|
1826 | rx_fastmap_ok,
|
---|
1827 | rx_fastmap_fail
|
---|
1828 | };
|
---|
1829 |
|
---|
1830 | enum rx_fastmap_entry
|
---|
1831 | {
|
---|
1832 | rx_fastmap_start,
|
---|
1833 | rx_fastmap_string_break
|
---|
1834 | };
|
---|
1835 |
|
---|
1836 | enum rx_test_return
|
---|
1837 | {
|
---|
1838 | rx_test_continuation,
|
---|
1839 | rx_test_error,
|
---|
1840 | rx_test_fail,
|
---|
1841 | rx_test_ok
|
---|
1842 | };
|
---|
1843 |
|
---|
1844 | enum rx_test_internal_return
|
---|
1845 | {
|
---|
1846 | rx_test_internal_error,
|
---|
1847 | rx_test_found_first,
|
---|
1848 | rx_test_line_finished
|
---|
1849 | };
|
---|
1850 |
|
---|
1851 | enum rx_test_match_entry
|
---|
1852 | {
|
---|
1853 | rx_test_start,
|
---|
1854 | rx_test_cache_hit_loop,
|
---|
1855 | rx_test_backreference_check,
|
---|
1856 | rx_test_backtrack_return
|
---|
1857 | };
|
---|
1858 |
|
---|
1859 | struct rx_search_state
|
---|
1860 | {
|
---|
1861 | /* Two groups of registers are kept. The group with the register state
|
---|
1862 | * of the current test match, and the group that holds the state at the end
|
---|
1863 | * of the best known match, if any.
|
---|
1864 | *
|
---|
1865 | * For some patterns, there may also be registers saved on the stack.
|
---|
1866 | */
|
---|
1867 | unsigned num_regs; /* Includes an element for register zero. */
|
---|
1868 | regoff_t * lparen; /* scratch space for register returns */
|
---|
1869 | regoff_t * rparen;
|
---|
1870 | regoff_t * best_lpspace; /* in case the user doesn't want these */
|
---|
1871 | regoff_t * best_rpspace; /* values, we still need space to store
|
---|
1872 | * them. Normally, this memoryis unused
|
---|
1873 | * and the space pointed to by REGS is
|
---|
1874 | * used instead.
|
---|
1875 | */
|
---|
1876 |
|
---|
1877 | int last_l; /* Highest index of a valid lparen. */
|
---|
1878 | int last_r; /* It's dual. */
|
---|
1879 |
|
---|
1880 | int * best_lparen; /* This contains the best known register */
|
---|
1881 | int * best_rparen; /* assignments.
|
---|
1882 | * This may point to the same mem as
|
---|
1883 | * best_lpspace, or it might point to memory
|
---|
1884 | * passed by the caller.
|
---|
1885 | */
|
---|
1886 | int best_last_l; /* best_last_l:best_lparen::last_l:lparen */
|
---|
1887 | int best_last_r;
|
---|
1888 |
|
---|
1889 |
|
---|
1890 | unsigned char * translate;
|
---|
1891 |
|
---|
1892 | struct rx_string_position outer_pos;
|
---|
1893 |
|
---|
1894 | struct rx_superstate * start_super;
|
---|
1895 | int nfa_choice;
|
---|
1896 | int first_found; /* If true, return after finding any match. */
|
---|
1897 | int ret_val;
|
---|
1898 |
|
---|
1899 | /* For continuations... */
|
---|
1900 | enum rx_outer_entry outer_search_resume_pt;
|
---|
1901 | struct re_pattern_buffer * saved_rxb;
|
---|
1902 | int saved_startpos;
|
---|
1903 | int saved_range;
|
---|
1904 | int saved_stop;
|
---|
1905 | int saved_total_size;
|
---|
1906 | rx_get_burst_fn saved_get_burst;
|
---|
1907 | rx_back_check_fn saved_back_check;
|
---|
1908 | struct re_registers * saved_regs;
|
---|
1909 |
|
---|
1910 | /**
|
---|
1911 | ** state for fastmap
|
---|
1912 | **/
|
---|
1913 | char * fastmap;
|
---|
1914 | int fastmap_chr;
|
---|
1915 | int fastmap_val;
|
---|
1916 |
|
---|
1917 | /* for continuations in the fastmap procedure: */
|
---|
1918 | enum rx_fastmap_entry fastmap_resume_pt;
|
---|
1919 |
|
---|
1920 | /**
|
---|
1921 | ** state for test_match
|
---|
1922 | **/
|
---|
1923 |
|
---|
1924 | /* The current superNFA position of the matcher. */
|
---|
1925 | struct rx_superstate * super;
|
---|
1926 |
|
---|
1927 | /* The matcher interprets a series of instruction frames.
|
---|
1928 | * This is the `instruction counter' for the interpretation.
|
---|
1929 | */
|
---|
1930 | struct rx_inx * ifr;
|
---|
1931 |
|
---|
1932 | /* We insert a ghost character in the string to prime
|
---|
1933 | * the nfa. test_pos.pos, test_pos.str_half, and test_pos.end_half
|
---|
1934 | * keep track of the test-match position and string-half.
|
---|
1935 | */
|
---|
1936 | unsigned char c;
|
---|
1937 |
|
---|
1938 | /* Position within the string. */
|
---|
1939 | struct rx_string_position test_pos;
|
---|
1940 |
|
---|
1941 | struct rx_stack_chunk * counter_stack;
|
---|
1942 | struct rx_stack_chunk * backtrack_stack;
|
---|
1943 | int backtrack_frame_bytes;
|
---|
1944 | int chunk_bytes;
|
---|
1945 | struct rx_stack_chunk * free_chunks;
|
---|
1946 |
|
---|
1947 | /* To return from this function, set test_ret and
|
---|
1948 | * `goto test_do_return'.
|
---|
1949 | *
|
---|
1950 | * Possible return values are:
|
---|
1951 | * 1 --- end of string while the superNFA is still going
|
---|
1952 | * 0 --- internal error (out of memory)
|
---|
1953 | * -1 --- search completed by reaching the superNFA fail state
|
---|
1954 | * -2 --- a match was found, maybe not the longest.
|
---|
1955 | *
|
---|
1956 | * When the search is complete (-1), best_last_r indicates whether
|
---|
1957 | * a match was found.
|
---|
1958 | *
|
---|
1959 | * -2 is return only if search_state.first_found is non-zero.
|
---|
1960 | *
|
---|
1961 | * if search_state.first_found is non-zero, a return of -1 indicates no match,
|
---|
1962 | * otherwise, best_last_r has to be checked.
|
---|
1963 | */
|
---|
1964 | int test_ret;
|
---|
1965 |
|
---|
1966 | int could_have_continued;
|
---|
1967 |
|
---|
1968 | #ifdef RX_DEBUG
|
---|
1969 | int backtrack_depth;
|
---|
1970 | /* There is a search tree with every node as set of deterministic
|
---|
1971 | * transitions in the super nfa. For every branch of a
|
---|
1972 | * backtrack point is an edge in the tree.
|
---|
1973 | * This counts up a pre-order of nodes in that tree.
|
---|
1974 | * It's saved on the search stack and printed when debugging.
|
---|
1975 | */
|
---|
1976 | int line_no;
|
---|
1977 | int lines_found;
|
---|
1978 | #endif
|
---|
1979 |
|
---|
1980 |
|
---|
1981 | /* For continuations within the match tester */
|
---|
1982 | enum rx_test_match_entry test_match_resume_pt;
|
---|
1983 | struct rx_inx * saved_next_tr_table;
|
---|
1984 | struct rx_inx * saved_this_tr_table;
|
---|
1985 | int saved_reg;
|
---|
1986 | struct rx_backtrack_frame * saved_bf;
|
---|
1987 |
|
---|
1988 | };
|
---|
1989 |
|
---|
1990 | |
---|
1991 |
|
---|
1992 | extern char rx_slowmap[];
|
---|
1993 | extern unsigned char rx_id_translation[];
|
---|
1994 |
|
---|
1995 | static __inline__ void
|
---|
1996 | init_fastmap (rxb, search_state)
|
---|
1997 | struct re_pattern_buffer * rxb;
|
---|
1998 | struct rx_search_state * search_state;
|
---|
1999 | {
|
---|
2000 | search_state->fastmap = (rxb->fastmap
|
---|
2001 | ? (char *)rxb->fastmap
|
---|
2002 | : (char *)rx_slowmap);
|
---|
2003 | /* Update the fastmap now if not correct already.
|
---|
2004 | * When the regexp was compiled, the fastmap was computed
|
---|
2005 | * and stored in a bitset. This expands the bitset into a
|
---|
2006 | * character array containing 1s and 0s.
|
---|
2007 | */
|
---|
2008 | if ((search_state->fastmap == rxb->fastmap) && !rxb->fastmap_accurate)
|
---|
2009 | rx_blow_up_fastmap (rxb);
|
---|
2010 | search_state->fastmap_chr = -1;
|
---|
2011 | search_state->fastmap_val = 0;
|
---|
2012 | search_state->fastmap_resume_pt = rx_fastmap_start;
|
---|
2013 | }
|
---|
2014 |
|
---|
2015 | static __inline__ void
|
---|
2016 | uninit_fastmap (rxb, search_state)
|
---|
2017 | struct re_pattern_buffer * rxb;
|
---|
2018 | struct rx_search_state * search_state;
|
---|
2019 | {
|
---|
2020 | /* Unset the fastmap sentinel */
|
---|
2021 | if (search_state->fastmap_chr >= 0)
|
---|
2022 | search_state->fastmap[search_state->fastmap_chr]
|
---|
2023 | = search_state->fastmap_val;
|
---|
2024 | }
|
---|
2025 |
|
---|
2026 | static __inline__ int
|
---|
2027 | fastmap_search (rxb, stop, get_burst, app_closure, search_state)
|
---|
2028 | struct re_pattern_buffer * rxb;
|
---|
2029 | int stop;
|
---|
2030 | rx_get_burst_fn get_burst;
|
---|
2031 | void * app_closure;
|
---|
2032 | struct rx_search_state * search_state;
|
---|
2033 | {
|
---|
2034 | enum rx_fastmap_entry pc;
|
---|
2035 |
|
---|
2036 | if (0)
|
---|
2037 | {
|
---|
2038 | return_continuation:
|
---|
2039 | search_state->fastmap_resume_pt = pc;
|
---|
2040 | return rx_fastmap_continuation;
|
---|
2041 | }
|
---|
2042 |
|
---|
2043 | pc = search_state->fastmap_resume_pt;
|
---|
2044 |
|
---|
2045 | switch (pc)
|
---|
2046 | {
|
---|
2047 | default:
|
---|
2048 | return rx_fastmap_error;
|
---|
2049 | case rx_fastmap_start:
|
---|
2050 | init_fastmap_sentinal:
|
---|
2051 | /* For the sake of fast fastmapping, set a sentinal in the fastmap.
|
---|
2052 | * This sentinal will trap the fastmap loop when it reaches the last
|
---|
2053 | * valid character in a string half.
|
---|
2054 | *
|
---|
2055 | * This must be reset when the fastmap/search loop crosses a string
|
---|
2056 | * boundry, and before returning to the caller. So sometimes,
|
---|
2057 | * the fastmap loop is restarted with `continue', othertimes by
|
---|
2058 | * `goto init_fastmap_sentinal'.
|
---|
2059 | */
|
---|
2060 | if (search_state->outer_pos.size)
|
---|
2061 | {
|
---|
2062 | search_state->fastmap_chr = ((search_state->outer_pos.search_direction == 1)
|
---|
2063 | ? *(search_state->outer_pos.end - 1)
|
---|
2064 | : *search_state->outer_pos.string);
|
---|
2065 | search_state->fastmap_val
|
---|
2066 | = search_state->fastmap[search_state->fastmap_chr];
|
---|
2067 | search_state->fastmap[search_state->fastmap_chr] = 1;
|
---|
2068 | }
|
---|
2069 | else
|
---|
2070 | {
|
---|
2071 | search_state->fastmap_chr = -1;
|
---|
2072 | search_state->fastmap_val = 0;
|
---|
2073 | }
|
---|
2074 |
|
---|
2075 | if (search_state->outer_pos.pos >= search_state->outer_pos.end)
|
---|
2076 | goto fastmap_hit_bound;
|
---|
2077 | else
|
---|
2078 | {
|
---|
2079 | if (search_state->outer_pos.search_direction == 1)
|
---|
2080 | {
|
---|
2081 | if (search_state->fastmap_val)
|
---|
2082 | {
|
---|
2083 | for (;;)
|
---|
2084 | {
|
---|
2085 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
2086 | ++search_state->outer_pos.pos;
|
---|
2087 | return rx_fastmap_ok;
|
---|
2088 | }
|
---|
2089 | }
|
---|
2090 | else
|
---|
2091 | {
|
---|
2092 | for (;;)
|
---|
2093 | {
|
---|
2094 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
2095 | ++search_state->outer_pos.pos;
|
---|
2096 | if (*search_state->outer_pos.pos != search_state->fastmap_chr)
|
---|
2097 | return rx_fastmap_ok;
|
---|
2098 | else
|
---|
2099 | {
|
---|
2100 | ++search_state->outer_pos.pos;
|
---|
2101 | if (search_state->outer_pos.pos == search_state->outer_pos.end)
|
---|
2102 | goto fastmap_hit_bound;
|
---|
2103 | }
|
---|
2104 | }
|
---|
2105 | }
|
---|
2106 | }
|
---|
2107 | else
|
---|
2108 | {
|
---|
2109 | __const__ unsigned char * bound;
|
---|
2110 | bound = search_state->outer_pos.string - 1;
|
---|
2111 | if (search_state->fastmap_val)
|
---|
2112 | {
|
---|
2113 | for (;;)
|
---|
2114 | {
|
---|
2115 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
2116 | --search_state->outer_pos.pos;
|
---|
2117 | return rx_fastmap_ok;
|
---|
2118 | }
|
---|
2119 | }
|
---|
2120 | else
|
---|
2121 | {
|
---|
2122 | for (;;)
|
---|
2123 | {
|
---|
2124 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
2125 | --search_state->outer_pos.pos;
|
---|
2126 | if ((*search_state->outer_pos.pos != search_state->fastmap_chr) || search_state->fastmap_val)
|
---|
2127 | return rx_fastmap_ok;
|
---|
2128 | else
|
---|
2129 | {
|
---|
2130 | --search_state->outer_pos.pos;
|
---|
2131 | if (search_state->outer_pos.pos == bound)
|
---|
2132 | goto fastmap_hit_bound;
|
---|
2133 | }
|
---|
2134 | }
|
---|
2135 | }
|
---|
2136 | }
|
---|
2137 | }
|
---|
2138 |
|
---|
2139 | case rx_fastmap_string_break:
|
---|
2140 | fastmap_hit_bound:
|
---|
2141 | {
|
---|
2142 | /* If we hit a bound, it may be time to fetch another burst
|
---|
2143 | * of string, or it may be time to return a continuation to
|
---|
2144 | * the caller, or it might be time to fail.
|
---|
2145 | */
|
---|
2146 |
|
---|
2147 | int burst_state;
|
---|
2148 | burst_state = get_burst (&search_state->outer_pos, app_closure, stop);
|
---|
2149 | switch (burst_state)
|
---|
2150 | {
|
---|
2151 | default:
|
---|
2152 | case rx_get_burst_error:
|
---|
2153 | return rx_fastmap_error;
|
---|
2154 | case rx_get_burst_continuation:
|
---|
2155 | {
|
---|
2156 | pc = rx_fastmap_string_break;
|
---|
2157 | goto return_continuation;
|
---|
2158 | }
|
---|
2159 | case rx_get_burst_ok:
|
---|
2160 | goto init_fastmap_sentinal;
|
---|
2161 | case rx_get_burst_no_more:
|
---|
2162 | /* ...not a string split, simply no more string.
|
---|
2163 | *
|
---|
2164 | * When searching backward, running out of string
|
---|
2165 | * is reason to quit.
|
---|
2166 | *
|
---|
2167 | * When searching forward, we allow the possibility
|
---|
2168 | * of an (empty) match after the last character in the
|
---|
2169 | * virtual string. So, fall through to the matcher
|
---|
2170 | */
|
---|
2171 | return ( (search_state->outer_pos.search_direction == 1)
|
---|
2172 | ? rx_fastmap_ok
|
---|
2173 | : rx_fastmap_fail);
|
---|
2174 | }
|
---|
2175 | }
|
---|
2176 | }
|
---|
2177 |
|
---|
2178 | }
|
---|
2179 |
|
---|
2180 | |
---|
2181 |
|
---|
2182 |
|
---|
2183 | #ifdef emacs
|
---|
2184 | /* The `emacs' switch turns on certain matching commands
|
---|
2185 | * that make sense only in Emacs.
|
---|
2186 | */
|
---|
2187 | #include "sysfuncs.h"
|
---|
2188 | #include "lisp.h"
|
---|
2189 | #include "buffer.h"
|
---|
2190 | #include "syntax.h"
|
---|
2191 | #endif /* emacs */
|
---|
2192 |
|
---|
2193 | /* Setting RX_MEMDBUG is useful if you have dbmalloc. Maybe with similar
|
---|
2194 | * packages too.
|
---|
2195 | */
|
---|
2196 | #ifdef RX_MEMDBUG
|
---|
2197 | #include <malloc.h>
|
---|
2198 | #endif /* RX_RX_MEMDBUG */
|
---|
2199 |
|
---|
2200 | /* We used to test for `BSTRING' here, but only GCC and Emacs define
|
---|
2201 | * `BSTRING', as far as I know, and neither of them use this code.
|
---|
2202 | */
|
---|
2203 | #if HAVE_STRING_H || STDC_HEADERS
|
---|
2204 | #include <string.h>
|
---|
2205 |
|
---|
2206 | #ifndef bcmp
|
---|
2207 | #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
|
---|
2208 | #endif
|
---|
2209 |
|
---|
2210 | #ifndef bcopy
|
---|
2211 | #define bcopy(s, d, n) memcpy ((d), (s), (n))
|
---|
2212 | #endif
|
---|
2213 |
|
---|
2214 | #ifndef bzero
|
---|
2215 | #define bzero(s, n) memset ((s), 0, (n))
|
---|
2216 | #endif
|
---|
2217 |
|
---|
2218 | #else /* HAVE_STRING_H || STDC_HEADERS */
|
---|
2219 | #include <strings.h>
|
---|
2220 | #endif /* not (HAVE_STRING_H || STDC_HEADERS) */
|
---|
2221 |
|
---|
2222 | #ifdef STDC_HEADERS
|
---|
2223 | #include <stdlib.h>
|
---|
2224 | #else /* not STDC_HEADERS */
|
---|
2225 | char *malloc ();
|
---|
2226 | char *realloc ();
|
---|
2227 | #endif /* not STDC_HEADERS */
|
---|
2228 |
|
---|
2229 |
|
---|
2230 | |
---|
2231 |
|
---|
2232 |
|
---|
2233 | /* How many characters in the character set. */
|
---|
2234 | #define CHAR_SET_SIZE (1 << CHARBITS)
|
---|
2235 |
|
---|
2236 | #ifndef emacs
|
---|
2237 | /* Define the syntax basics for \<, \>, etc.
|
---|
2238 | * This must be nonzero for the wordchar and notwordchar pattern
|
---|
2239 | * commands in re_match_2.
|
---|
2240 | */
|
---|
2241 | #ifndef Sword
|
---|
2242 | #define Sword 1
|
---|
2243 | #endif
|
---|
2244 | #define SYNTAX(c) re_syntax_table[c]
|
---|
2245 | #ifdef SYNTAX_TABLE
|
---|
2246 | extern char *re_syntax_table;
|
---|
2247 | #else
|
---|
2248 | RX_DECL char re_syntax_table[CHAR_SET_SIZE];
|
---|
2249 | #endif
|
---|
2250 | #endif /* not emacs */
|
---|
2251 |
|
---|
2252 |
|
---|
2253 | /* Test if at very beginning or at very end of the virtual concatenation
|
---|
2254 | * of `string1' and `string2'. If only one string, it's `string2'.
|
---|
2255 | */
|
---|
2256 |
|
---|
2257 | #define AT_STRINGS_BEG() \
|
---|
2258 | ( -1 \
|
---|
2259 | == ((search_state.test_pos.pos - search_state.test_pos.string) \
|
---|
2260 | + search_state.test_pos.offset))
|
---|
2261 |
|
---|
2262 | #define AT_STRINGS_END() \
|
---|
2263 | ( (total_size - 1) \
|
---|
2264 | == ((search_state.test_pos.pos - search_state.test_pos.string) \
|
---|
2265 | + search_state.test_pos.offset))
|
---|
2266 |
|
---|
2267 |
|
---|
2268 | /* Test if POS + 1 points to a character which is word-constituent. We have
|
---|
2269 | * two special cases to check for: if past the end of string1, look at
|
---|
2270 | * the first character in string2; and if before the beginning of
|
---|
2271 | * string2, look at the last character in string1.
|
---|
2272 | *
|
---|
2273 | * Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG ().
|
---|
2274 | */
|
---|
2275 | #define LETTER_P(POS,OFF) \
|
---|
2276 | ( SYNTAX (fetch_char(POS, OFF, app_closure, stop)) \
|
---|
2277 | == Sword)
|
---|
2278 |
|
---|
2279 | /* Test if the character at D and the one after D differ with respect
|
---|
2280 | * to being word-constituent.
|
---|
2281 | */
|
---|
2282 | #define AT_WORD_BOUNDARY(d) \
|
---|
2283 | (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d,0) != LETTER_P (d, 1))
|
---|
2284 |
|
---|
2285 |
|
---|
2286 | #ifdef RX_SUPPORT_CONTINUATIONS
|
---|
2287 | #define RX_STACK_ALLOC(BYTES) malloc(BYTES)
|
---|
2288 | #define RX_STACK_FREE(MEM) free(MEM)
|
---|
2289 | #else
|
---|
2290 | #define RX_STACK_ALLOC(BYTES) alloca(BYTES)
|
---|
2291 | #define RX_STACK_FREE(MEM) \
|
---|
2292 | ((struct rx_stack_chunk *)MEM)->next_chunk = search_state.free_chunks; \
|
---|
2293 | search_state.free_chunks = ((struct rx_stack_chunk *)MEM);
|
---|
2294 |
|
---|
2295 | #endif
|
---|
2296 |
|
---|
2297 | #define PUSH(CHUNK_VAR,BYTES) \
|
---|
2298 | if (!CHUNK_VAR || (CHUNK_VAR->bytes_left < (BYTES))) \
|
---|
2299 | { \
|
---|
2300 | struct rx_stack_chunk * new_chunk; \
|
---|
2301 | if (search_state.free_chunks) \
|
---|
2302 | { \
|
---|
2303 | new_chunk = search_state.free_chunks; \
|
---|
2304 | search_state.free_chunks = search_state.free_chunks->next_chunk; \
|
---|
2305 | } \
|
---|
2306 | else \
|
---|
2307 | { \
|
---|
2308 | new_chunk = (struct rx_stack_chunk *)RX_STACK_ALLOC(search_state.chunk_bytes); \
|
---|
2309 | if (!new_chunk) \
|
---|
2310 | { \
|
---|
2311 | search_state.ret_val = 0; \
|
---|
2312 | goto test_do_return; \
|
---|
2313 | } \
|
---|
2314 | } \
|
---|
2315 | new_chunk->sp = (char *)new_chunk + sizeof (struct rx_stack_chunk); \
|
---|
2316 | new_chunk->bytes_left = (search_state.chunk_bytes \
|
---|
2317 | - (BYTES) \
|
---|
2318 | - sizeof (struct rx_stack_chunk)); \
|
---|
2319 | new_chunk->next_chunk = CHUNK_VAR; \
|
---|
2320 | CHUNK_VAR = new_chunk; \
|
---|
2321 | } \
|
---|
2322 | else \
|
---|
2323 | (CHUNK_VAR->sp += (BYTES)), (CHUNK_VAR->bytes_left -= (BYTES))
|
---|
2324 |
|
---|
2325 | #define POP(CHUNK_VAR,BYTES) \
|
---|
2326 | if (CHUNK_VAR->sp == ((char *)CHUNK_VAR + sizeof(*CHUNK_VAR))) \
|
---|
2327 | { \
|
---|
2328 | struct rx_stack_chunk * new_chunk = CHUNK_VAR->next_chunk; \
|
---|
2329 | RX_STACK_FREE(CHUNK_VAR); \
|
---|
2330 | CHUNK_VAR = new_chunk; \
|
---|
2331 | } \
|
---|
2332 | else \
|
---|
2333 | (CHUNK_VAR->sp -= BYTES), (CHUNK_VAR->bytes_left += BYTES)
|
---|
2334 |
|
---|
2335 |
|
---|
2336 |
|
---|
2337 | #define SRCH_TRANSLATE(C) search_state.translate[(unsigned char) (C)]
|
---|
2338 |
|
---|
2339 |
|
---|
2340 | |
---|
2341 |
|
---|
2342 |
|
---|
2343 | #ifdef __STDC__
|
---|
2344 | RX_DECL __inline__ int
|
---|
2345 | rx_search (struct re_pattern_buffer * rxb,
|
---|
2346 | int startpos,
|
---|
2347 | int range,
|
---|
2348 | int stop,
|
---|
2349 | int total_size,
|
---|
2350 | rx_get_burst_fn get_burst,
|
---|
2351 | rx_back_check_fn back_check,
|
---|
2352 | rx_fetch_char_fn fetch_char,
|
---|
2353 | void * app_closure,
|
---|
2354 | struct re_registers * regs,
|
---|
2355 | struct rx_search_state * resume_state,
|
---|
2356 | struct rx_search_state * save_state)
|
---|
2357 | #else
|
---|
2358 | RX_DECL __inline__ int
|
---|
2359 | rx_search (rxb, startpos, range, stop, total_size,
|
---|
2360 | get_burst, back_check, fetch_char,
|
---|
2361 | app_closure, regs, resume_state, save_state)
|
---|
2362 | struct re_pattern_buffer * rxb;
|
---|
2363 | int startpos;
|
---|
2364 | int range;
|
---|
2365 | int stop;
|
---|
2366 | int total_size;
|
---|
2367 | rx_get_burst_fn get_burst;
|
---|
2368 | rx_back_check_fn back_check;
|
---|
2369 | rx_fetch_char_fn fetch_char;
|
---|
2370 | void * app_closure;
|
---|
2371 | struct re_registers * regs;
|
---|
2372 | struct rx_search_state * resume_state;
|
---|
2373 | struct rx_search_state * save_state;
|
---|
2374 | #endif
|
---|
2375 | {
|
---|
2376 | int pc;
|
---|
2377 | int test_state;
|
---|
2378 | struct rx_search_state search_state;
|
---|
2379 |
|
---|
2380 | search_state.free_chunks = 0;
|
---|
2381 | if (!resume_state)
|
---|
2382 | pc = rx_outer_start;
|
---|
2383 | else
|
---|
2384 | {
|
---|
2385 | search_state = *resume_state;
|
---|
2386 | regs = search_state.saved_regs;
|
---|
2387 | rxb = search_state.saved_rxb;
|
---|
2388 | startpos = search_state.saved_startpos;
|
---|
2389 | range = search_state.saved_range;
|
---|
2390 | stop = search_state.saved_stop;
|
---|
2391 | total_size = search_state.saved_total_size;
|
---|
2392 | get_burst = search_state.saved_get_burst;
|
---|
2393 | back_check = search_state.saved_back_check;
|
---|
2394 | pc = search_state.outer_search_resume_pt;
|
---|
2395 | if (0)
|
---|
2396 | {
|
---|
2397 | return_continuation:
|
---|
2398 | if (save_state)
|
---|
2399 | {
|
---|
2400 | *save_state = search_state;
|
---|
2401 | save_state->saved_regs = regs;
|
---|
2402 | save_state->saved_rxb = rxb;
|
---|
2403 | save_state->saved_startpos = startpos;
|
---|
2404 | save_state->saved_range = range;
|
---|
2405 | save_state->saved_stop = stop;
|
---|
2406 | save_state->saved_total_size = total_size;
|
---|
2407 | save_state->saved_get_burst = get_burst;
|
---|
2408 | save_state->saved_back_check = back_check;
|
---|
2409 | save_state->outer_search_resume_pt = pc;
|
---|
2410 | }
|
---|
2411 | return rx_search_continuation;
|
---|
2412 | }
|
---|
2413 | }
|
---|
2414 |
|
---|
2415 | switch (pc)
|
---|
2416 | {
|
---|
2417 | case rx_outer_start:
|
---|
2418 | search_state.ret_val = rx_search_fail;
|
---|
2419 | ( search_state.lparen
|
---|
2420 | = search_state.rparen
|
---|
2421 | = search_state.best_lpspace
|
---|
2422 | = search_state.best_rpspace
|
---|
2423 | = 0);
|
---|
2424 |
|
---|
2425 | /* figure the number of registers we may need for use in backreferences.
|
---|
2426 | * the number here includes an element for register zero.
|
---|
2427 | */
|
---|
2428 | search_state.num_regs = rxb->re_nsub + 1;
|
---|
2429 |
|
---|
2430 |
|
---|
2431 | /* check for out-of-range startpos. */
|
---|
2432 | if ((startpos < 0) || (startpos > total_size))
|
---|
2433 | return rx_search_fail;
|
---|
2434 |
|
---|
2435 | /* fix up range if it might eventually take us outside the string. */
|
---|
2436 | {
|
---|
2437 | int endpos;
|
---|
2438 | endpos = startpos + range;
|
---|
2439 | if (endpos < -1)
|
---|
2440 | range = (-1 - startpos);
|
---|
2441 | else if (endpos > (total_size + 1))
|
---|
2442 | range = total_size - startpos;
|
---|
2443 | }
|
---|
2444 |
|
---|
2445 | /* if the search isn't to be a backwards one, don't waste time in a
|
---|
2446 | * long search for a pattern that says it is anchored.
|
---|
2447 | */
|
---|
2448 | if (rxb->begbuf_only && (range > 0))
|
---|
2449 | {
|
---|
2450 | if (startpos > 0)
|
---|
2451 | return rx_search_fail;
|
---|
2452 | else
|
---|
2453 | range = 1;
|
---|
2454 | }
|
---|
2455 |
|
---|
2456 | /* decide whether to use internal or user-provided reg buffers. */
|
---|
2457 | if (!regs || rxb->no_sub)
|
---|
2458 | {
|
---|
2459 | search_state.best_lpspace =
|
---|
2460 | (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
2461 | search_state.best_rpspace =
|
---|
2462 | (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
2463 | search_state.best_lparen = search_state.best_lpspace;
|
---|
2464 | search_state.best_rparen = search_state.best_rpspace;
|
---|
2465 | }
|
---|
2466 | else
|
---|
2467 | {
|
---|
2468 | /* have the register data arrays been allocated? */
|
---|
2469 | if (rxb->regs_allocated == REGS_UNALLOCATED)
|
---|
2470 | { /* no. so allocate them with malloc. we need one
|
---|
2471 | extra element beyond `search_state.num_regs' for the `-1' marker
|
---|
2472 | gnu code uses. */
|
---|
2473 | regs->num_regs = MAX (RE_NREGS, rxb->re_nsub + 1);
|
---|
2474 | regs->start = ((regoff_t *)
|
---|
2475 | malloc (regs->num_regs * sizeof ( regoff_t)));
|
---|
2476 | regs->end = ((regoff_t *)
|
---|
2477 | malloc (regs->num_regs * sizeof ( regoff_t)));
|
---|
2478 | if (regs->start == 0 || regs->end == 0)
|
---|
2479 | return rx_search_error;
|
---|
2480 | rxb->regs_allocated = REGS_REALLOCATE;
|
---|
2481 | }
|
---|
2482 | else if (rxb->regs_allocated == REGS_REALLOCATE)
|
---|
2483 | { /* yes. if we need more elements than were already
|
---|
2484 | allocated, reallocate them. if we need fewer, just
|
---|
2485 | leave it alone. */
|
---|
2486 | if (regs->num_regs < search_state.num_regs + 1)
|
---|
2487 | {
|
---|
2488 | regs->num_regs = search_state.num_regs + 1;
|
---|
2489 | regs->start = ((regoff_t *)
|
---|
2490 | realloc (regs->start,
|
---|
2491 | regs->num_regs * sizeof (regoff_t)));
|
---|
2492 | regs->end = ((regoff_t *)
|
---|
2493 | realloc (regs->end,
|
---|
2494 | regs->num_regs * sizeof ( regoff_t)));
|
---|
2495 | if (regs->start == 0 || regs->end == 0)
|
---|
2496 | return rx_search_error;
|
---|
2497 | }
|
---|
2498 | }
|
---|
2499 | else if (rxb->regs_allocated != REGS_FIXED)
|
---|
2500 | return rx_search_error;
|
---|
2501 |
|
---|
2502 | if (regs->num_regs < search_state.num_regs + 1)
|
---|
2503 | {
|
---|
2504 | search_state.best_lpspace =
|
---|
2505 | ((regoff_t *)
|
---|
2506 | REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
|
---|
2507 | search_state.best_rpspace =
|
---|
2508 | ((regoff_t *)
|
---|
2509 | REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
|
---|
2510 | search_state.best_lparen = search_state.best_lpspace;
|
---|
2511 | search_state.best_rparen = search_state.best_rpspace;
|
---|
2512 | }
|
---|
2513 | else
|
---|
2514 | {
|
---|
2515 | search_state.best_lparen = regs->start;
|
---|
2516 | search_state.best_rparen = regs->end;
|
---|
2517 | }
|
---|
2518 | }
|
---|
2519 |
|
---|
2520 | search_state.lparen =
|
---|
2521 | (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
2522 | search_state.rparen =
|
---|
2523 | (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
2524 |
|
---|
2525 | if (! ( search_state.best_rparen
|
---|
2526 | && search_state.best_lparen
|
---|
2527 | && search_state.lparen && search_state.rparen))
|
---|
2528 | return rx_search_error;
|
---|
2529 |
|
---|
2530 | search_state.best_last_l = search_state.best_last_r = -1;
|
---|
2531 |
|
---|
2532 | search_state.translate = (rxb->translate
|
---|
2533 | ? rxb->translate
|
---|
2534 | : rx_id_translation);
|
---|
2535 |
|
---|
2536 |
|
---|
2537 |
|
---|
2538 | /*
|
---|
2539 | * two nfa's were compiled.
|
---|
2540 | * `0' is complete.
|
---|
2541 | * `1' faster but gets registers wrong and ends too soon.
|
---|
2542 | */
|
---|
2543 | search_state.nfa_choice = (regs && !rxb->least_subs) ? '\0' : '\1';
|
---|
2544 |
|
---|
2545 | /* we have the option to look for the best match or the first
|
---|
2546 | * one we can find. if the user isn't asking for register information,
|
---|
2547 | * we don't need to find the best match.
|
---|
2548 | */
|
---|
2549 | search_state.first_found = !regs;
|
---|
2550 |
|
---|
2551 | if (range >= 0)
|
---|
2552 | {
|
---|
2553 | search_state.outer_pos.search_end = startpos + range;
|
---|
2554 | search_state.outer_pos.search_direction = 1;
|
---|
2555 | }
|
---|
2556 | else
|
---|
2557 | {
|
---|
2558 | search_state.outer_pos.search_end = startpos + range;
|
---|
2559 | search_state.outer_pos.search_direction = -1;
|
---|
2560 | }
|
---|
2561 |
|
---|
2562 | /* the vacuous search always turns up nothing. */
|
---|
2563 | if ((search_state.outer_pos.search_direction == 1)
|
---|
2564 | ? (startpos > search_state.outer_pos.search_end)
|
---|
2565 | : (startpos < search_state.outer_pos.search_end))
|
---|
2566 | return rx_search_fail;
|
---|
2567 |
|
---|
2568 | /* now we build the starting state of the supernfa. */
|
---|
2569 | {
|
---|
2570 | struct rx_superset * start_contents;
|
---|
2571 | struct rx_nfa_state_set * start_nfa_set;
|
---|
2572 |
|
---|
2573 | /* we presume here that the nfa start state has only one
|
---|
2574 | * possible future with no side effects.
|
---|
2575 | */
|
---|
2576 | start_nfa_set = rxb->start->futures->destset;
|
---|
2577 | if ( rxb->rx.start_set
|
---|
2578 | && (rxb->rx.start_set->starts_for == &rxb->rx))
|
---|
2579 | start_contents = rxb->rx.start_set;
|
---|
2580 | else
|
---|
2581 | {
|
---|
2582 | start_contents =
|
---|
2583 | rx_superstate_eclosure_union (&rxb->rx,
|
---|
2584 | rx_superset_cons (&rxb->rx, 0, 0),
|
---|
2585 | start_nfa_set);
|
---|
2586 |
|
---|
2587 | if (!start_contents)
|
---|
2588 | return rx_search_fail;
|
---|
2589 |
|
---|
2590 | start_contents->starts_for = &rxb->rx;
|
---|
2591 | rxb->rx.start_set = start_contents;
|
---|
2592 | }
|
---|
2593 | if ( start_contents->superstate
|
---|
2594 | && (start_contents->superstate->rx_id == rxb->rx.rx_id))
|
---|
2595 | {
|
---|
2596 | search_state.start_super = start_contents->superstate;
|
---|
2597 | rx_lock_superstate (&rxb->rx, search_state.start_super);
|
---|
2598 | }
|
---|
2599 | else
|
---|
2600 | {
|
---|
2601 | rx_protect_superset (&rxb->rx, start_contents);
|
---|
2602 |
|
---|
2603 | search_state.start_super = rx_superstate (&rxb->rx, start_contents);
|
---|
2604 | if (!search_state.start_super)
|
---|
2605 | return rx_search_fail;
|
---|
2606 | rx_lock_superstate (&rxb->rx, search_state.start_super);
|
---|
2607 | rx_release_superset (&rxb->rx, start_contents);
|
---|
2608 | }
|
---|
2609 | }
|
---|
2610 |
|
---|
2611 |
|
---|
2612 | /* The outer_pos tracks the position within the strings
|
---|
2613 | * as seen by loop that calls fastmap_search.
|
---|
2614 | *
|
---|
2615 | * The caller supplied get_burst function actually
|
---|
2616 | * gives us pointers to chars.
|
---|
2617 | *
|
---|
2618 | * Communication with the get_burst function is through an
|
---|
2619 | * rx_string_position structure. Here, the structure for
|
---|
2620 | * outer_pos is initialized. It is set to point to the
|
---|
2621 | * NULL string, at an offset of STARTPOS. STARTPOS is out
|
---|
2622 | * of range of the NULL string, so the first call to
|
---|
2623 | * getburst will patch up the rx_string_position to point
|
---|
2624 | * to valid characters.
|
---|
2625 | */
|
---|
2626 |
|
---|
2627 | ( search_state.outer_pos.string
|
---|
2628 | = search_state.outer_pos.end
|
---|
2629 | = 0);
|
---|
2630 |
|
---|
2631 | search_state.outer_pos.offset = 0;
|
---|
2632 | search_state.outer_pos.size = 0;
|
---|
2633 | search_state.outer_pos.pos = (unsigned char *)startpos;
|
---|
2634 | init_fastmap (rxb, &search_state);
|
---|
2635 |
|
---|
2636 | search_state.fastmap_resume_pt = rx_fastmap_start;
|
---|
2637 | case rx_outer_fastmap:
|
---|
2638 | /* do { */
|
---|
2639 | pseudo_do:
|
---|
2640 | {
|
---|
2641 | {
|
---|
2642 | int fastmap_state;
|
---|
2643 | fastmap_state = fastmap_search (rxb, stop, get_burst, app_closure,
|
---|
2644 | &search_state);
|
---|
2645 | switch (fastmap_state)
|
---|
2646 | {
|
---|
2647 | case rx_fastmap_continuation:
|
---|
2648 | pc = rx_outer_fastmap;
|
---|
2649 | goto return_continuation;
|
---|
2650 | case rx_fastmap_fail:
|
---|
2651 | goto finish;
|
---|
2652 | case rx_fastmap_ok:
|
---|
2653 | break;
|
---|
2654 | }
|
---|
2655 | }
|
---|
2656 |
|
---|
2657 | /* now the fastmap loop has brought us to a plausible
|
---|
2658 | * starting point for a match. so, it's time to run the
|
---|
2659 | * nfa and see if a match occured.
|
---|
2660 | */
|
---|
2661 | startpos = ( search_state.outer_pos.pos
|
---|
2662 | - search_state.outer_pos.string
|
---|
2663 | + search_state.outer_pos.offset);
|
---|
2664 | /*|*/ if ((range > 0) && (startpos == search_state.outer_pos.search_end))
|
---|
2665 | /*|*/ goto finish;
|
---|
2666 | }
|
---|
2667 |
|
---|
2668 | search_state.test_match_resume_pt = rx_test_start;
|
---|
2669 | /* do interrupted for entry point... */
|
---|
2670 | case rx_outer_test:
|
---|
2671 | /* ...do continued */
|
---|
2672 | {
|
---|
2673 | goto test_match;
|
---|
2674 | test_returns_to_search:
|
---|
2675 | switch (test_state)
|
---|
2676 | {
|
---|
2677 | case rx_test_continuation:
|
---|
2678 | pc = rx_outer_test;
|
---|
2679 | goto return_continuation;
|
---|
2680 | case rx_test_error:
|
---|
2681 | search_state.ret_val = rx_search_error;
|
---|
2682 | goto finish;
|
---|
2683 | case rx_test_fail:
|
---|
2684 | break;
|
---|
2685 | case rx_test_ok:
|
---|
2686 | goto finish;
|
---|
2687 | }
|
---|
2688 | search_state.outer_pos.pos += search_state.outer_pos.search_direction;
|
---|
2689 | startpos += search_state.outer_pos.search_direction;
|
---|
2690 | #if 0
|
---|
2691 | /*|*/ if (search_state.test_pos.pos < search_state.test_pos.end)
|
---|
2692 | /*|*/ break;
|
---|
2693 | #endif
|
---|
2694 | }
|
---|
2695 | /* do interrupted for entry point... */
|
---|
2696 | case rx_outer_restore_pos:
|
---|
2697 | {
|
---|
2698 | int x;
|
---|
2699 | x = get_burst (&search_state.outer_pos, app_closure, stop);
|
---|
2700 | switch (x)
|
---|
2701 | {
|
---|
2702 | case rx_get_burst_continuation:
|
---|
2703 | pc = rx_outer_restore_pos;
|
---|
2704 | goto return_continuation;
|
---|
2705 | case rx_get_burst_error:
|
---|
2706 | search_state.ret_val = rx_search_error;
|
---|
2707 | goto finish;
|
---|
2708 | case rx_get_burst_no_more:
|
---|
2709 | if (rxb->can_match_empty)
|
---|
2710 | break;
|
---|
2711 | goto finish;
|
---|
2712 | case rx_get_burst_ok:
|
---|
2713 | break;
|
---|
2714 | }
|
---|
2715 | } /* } while (...see below...) */
|
---|
2716 |
|
---|
2717 | if ((search_state.outer_pos.search_direction == 1)
|
---|
2718 | ? (startpos < search_state.outer_pos.search_end)
|
---|
2719 | : (startpos > search_state.outer_pos.search_end))
|
---|
2720 | goto pseudo_do;
|
---|
2721 |
|
---|
2722 |
|
---|
2723 | finish:
|
---|
2724 | uninit_fastmap (rxb, &search_state);
|
---|
2725 | if (search_state.start_super)
|
---|
2726 | rx_unlock_superstate (&rxb->rx, search_state.start_super);
|
---|
2727 |
|
---|
2728 | #ifdef regex_malloc
|
---|
2729 | if (search_state.lparen) free (search_state.lparen);
|
---|
2730 | if (search_state.rparen) free (search_state.rparen);
|
---|
2731 | if (search_state.best_lpspace) free (search_state.best_lpspace);
|
---|
2732 | if (search_state.best_rpspace) free (search_state.best_rpspace);
|
---|
2733 | #endif
|
---|
2734 | return search_state.ret_val;
|
---|
2735 | }
|
---|
2736 |
|
---|
2737 |
|
---|
2738 | test_match:
|
---|
2739 | {
|
---|
2740 | enum rx_test_match_entry test_pc;
|
---|
2741 | int inx;
|
---|
2742 | test_pc = search_state.test_match_resume_pt;
|
---|
2743 | if (test_pc == rx_test_start)
|
---|
2744 | {
|
---|
2745 | #ifdef RX_DEBUG
|
---|
2746 | search_state.backtrack_depth = 0;
|
---|
2747 | #endif
|
---|
2748 | search_state.last_l = search_state.last_r = 0;
|
---|
2749 | search_state.lparen[0] = startpos;
|
---|
2750 | search_state.super = search_state.start_super;
|
---|
2751 | search_state.c = search_state.nfa_choice;
|
---|
2752 | search_state.test_pos.pos = search_state.outer_pos.pos - 1;
|
---|
2753 | search_state.test_pos.string = search_state.outer_pos.string;
|
---|
2754 | search_state.test_pos.end = search_state.outer_pos.end;
|
---|
2755 | search_state.test_pos.offset = search_state.outer_pos.offset;
|
---|
2756 | search_state.test_pos.size = search_state.outer_pos.size;
|
---|
2757 | search_state.test_pos.search_direction = 1;
|
---|
2758 | search_state.counter_stack = 0;
|
---|
2759 | search_state.backtrack_stack = 0;
|
---|
2760 | search_state.backtrack_frame_bytes =
|
---|
2761 | (sizeof (struct rx_backtrack_frame)
|
---|
2762 | + (rxb->match_regs_on_stack
|
---|
2763 | ? sizeof (regoff_t) * (search_state.num_regs + 1) * 2
|
---|
2764 | : 0));
|
---|
2765 | search_state.chunk_bytes = search_state.backtrack_frame_bytes * 64;
|
---|
2766 | search_state.test_ret = rx_test_line_finished;
|
---|
2767 | search_state.could_have_continued = 0;
|
---|
2768 | }
|
---|
2769 | /* This is while (1)...except that the body of the loop is interrupted
|
---|
2770 | * by some alternative entry points.
|
---|
2771 | */
|
---|
2772 | pseudo_while_1:
|
---|
2773 | switch (test_pc)
|
---|
2774 | {
|
---|
2775 | case rx_test_cache_hit_loop:
|
---|
2776 | goto resume_continuation_1;
|
---|
2777 | case rx_test_backreference_check:
|
---|
2778 | goto resume_continuation_2;
|
---|
2779 | case rx_test_backtrack_return:
|
---|
2780 | goto resume_continuation_3;
|
---|
2781 | case rx_test_start:
|
---|
2782 | #ifdef RX_DEBUG
|
---|
2783 | /* There is a search tree with every node as set of deterministic
|
---|
2784 | * transitions in the super nfa. For every branch of a
|
---|
2785 | * backtrack point is an edge in the tree.
|
---|
2786 | * This counts up a pre-order of nodes in that tree.
|
---|
2787 | * It's saved on the search stack and printed when debugging.
|
---|
2788 | */
|
---|
2789 | search_state.line_no = 0;
|
---|
2790 | search_state.lines_found = 0;
|
---|
2791 | #endif
|
---|
2792 |
|
---|
2793 | top_of_cycle:
|
---|
2794 | /* A superstate is basicly a transition table, indexed by
|
---|
2795 | * characters from the string being tested, and containing
|
---|
2796 | * RX_INX (`instruction frame') structures.
|
---|
2797 | */
|
---|
2798 | search_state.ifr = &search_state.super->transitions [search_state.c];
|
---|
2799 |
|
---|
2800 | recurse_test_match:
|
---|
2801 | /* This is the point to which control is sent when the
|
---|
2802 | * test matcher `recurses'. Before jumping here, some variables
|
---|
2803 | * need to be saved on the stack and the next instruction frame
|
---|
2804 | * has to be computed.
|
---|
2805 | */
|
---|
2806 |
|
---|
2807 | restart:
|
---|
2808 | /* Some instructions don't advance the matcher, but just
|
---|
2809 | * carry out some side effects and fetch a new instruction.
|
---|
2810 | * To dispatch that new instruction, `goto restart'.
|
---|
2811 | */
|
---|
2812 |
|
---|
2813 | {
|
---|
2814 | struct rx_inx * next_tr_table;
|
---|
2815 | struct rx_inx * this_tr_table;
|
---|
2816 | /* The fastest route through the loop is when the instruction
|
---|
2817 | * is RX_NEXT_CHAR. This case is detected when SEARCH_STATE.IFR->DATA
|
---|
2818 | * is non-zero. In that case, it points to the next
|
---|
2819 | * superstate.
|
---|
2820 | *
|
---|
2821 | * This allows us to not bother fetching the bytecode.
|
---|
2822 | */
|
---|
2823 | next_tr_table = (struct rx_inx *)search_state.ifr->data;
|
---|
2824 | this_tr_table = search_state.super->transitions;
|
---|
2825 | while (next_tr_table)
|
---|
2826 | {
|
---|
2827 | #ifdef RX_DEBUG_0
|
---|
2828 | if (rx_debug_trace)
|
---|
2829 | {
|
---|
2830 | struct rx_superset * setp;
|
---|
2831 |
|
---|
2832 | fprintf (stderr, "%d %d>> re_next_char @ %d (%d)",
|
---|
2833 | search_state.line_no,
|
---|
2834 | search_state.backtrack_depth,
|
---|
2835 | (search_state.test_pos.pos - search_state.test_pos.string
|
---|
2836 | + search_state.test_pos.offset), search_state.c);
|
---|
2837 |
|
---|
2838 | search_state.super =
|
---|
2839 | ((struct rx_superstate *)
|
---|
2840 | ((char *)this_tr_table
|
---|
2841 | - ((unsigned long)
|
---|
2842 | ((struct rx_superstate *)0)->transitions)));
|
---|
2843 |
|
---|
2844 | setp = search_state.super->contents;
|
---|
2845 | fprintf (stderr, " superstet (rx=%d, &=%x: ",
|
---|
2846 | rxb->rx.rx_id, setp);
|
---|
2847 | while (setp)
|
---|
2848 | {
|
---|
2849 | fprintf (stderr, "%d ", setp->id);
|
---|
2850 | setp = setp->cdr;
|
---|
2851 | }
|
---|
2852 | fprintf (stderr, "\n");
|
---|
2853 | }
|
---|
2854 | #endif
|
---|
2855 | this_tr_table = next_tr_table;
|
---|
2856 | ++search_state.test_pos.pos;
|
---|
2857 | if (search_state.test_pos.pos == search_state.test_pos.end)
|
---|
2858 | {
|
---|
2859 | int burst_state;
|
---|
2860 | try_burst_1:
|
---|
2861 | burst_state = get_burst (&search_state.test_pos,
|
---|
2862 | app_closure, stop);
|
---|
2863 | switch (burst_state)
|
---|
2864 | {
|
---|
2865 | case rx_get_burst_continuation:
|
---|
2866 | search_state.saved_this_tr_table = this_tr_table;
|
---|
2867 | search_state.saved_next_tr_table = next_tr_table;
|
---|
2868 | test_pc = rx_test_cache_hit_loop;
|
---|
2869 | goto test_return_continuation;
|
---|
2870 |
|
---|
2871 | resume_continuation_1:
|
---|
2872 | /* Continuation one jumps here to do its work: */
|
---|
2873 | search_state.saved_this_tr_table = this_tr_table;
|
---|
2874 | search_state.saved_next_tr_table = next_tr_table;
|
---|
2875 | goto try_burst_1;
|
---|
2876 |
|
---|
2877 | case rx_get_burst_ok:
|
---|
2878 | /* get_burst succeeded...keep going */
|
---|
2879 | break;
|
---|
2880 |
|
---|
2881 | case rx_get_burst_no_more:
|
---|
2882 | search_state.test_ret = rx_test_line_finished;
|
---|
2883 | search_state.could_have_continued = 1;
|
---|
2884 | goto test_do_return;
|
---|
2885 |
|
---|
2886 | case rx_get_burst_error:
|
---|
2887 | /* An error... */
|
---|
2888 | search_state.test_ret = rx_test_internal_error;
|
---|
2889 | goto test_do_return;
|
---|
2890 | }
|
---|
2891 | }
|
---|
2892 | search_state.c = *search_state.test_pos.pos;
|
---|
2893 | search_state.ifr = this_tr_table + search_state.c;
|
---|
2894 | next_tr_table = (struct rx_inx *)search_state.ifr->data;
|
---|
2895 | } /* Fast loop through cached transition tables */
|
---|
2896 |
|
---|
2897 | /* Here when we ran out of cached next-char transitions.
|
---|
2898 | * So, it will be necessary to do a more expensive
|
---|
2899 | * dispatch on the current instruction. The superstate
|
---|
2900 | * pointer is allowed to become invalid during next-char
|
---|
2901 | * transitions -- now we must bring it up to date.
|
---|
2902 | */
|
---|
2903 | search_state.super =
|
---|
2904 | ((struct rx_superstate *)
|
---|
2905 | ((char *)this_tr_table
|
---|
2906 | - ((unsigned long)
|
---|
2907 | ((struct rx_superstate *)0)->transitions)));
|
---|
2908 | }
|
---|
2909 |
|
---|
2910 | /* We've encountered an instruction other than next-char.
|
---|
2911 | * Dispatch that instruction:
|
---|
2912 | */
|
---|
2913 | inx = (int)search_state.ifr->inx;
|
---|
2914 | #ifdef RX_DEBUG_0
|
---|
2915 | if (rx_debug_trace)
|
---|
2916 | {
|
---|
2917 | struct rx_superset * setp = search_state.super->contents;
|
---|
2918 |
|
---|
2919 | fprintf (stderr, "%d %d>> %s @ %d (%d)", search_state.line_no,
|
---|
2920 | search_state.backtrack_depth,
|
---|
2921 | inx_names[inx],
|
---|
2922 | (search_state.test_pos.pos - search_state.test_pos.string
|
---|
2923 | + (test_pos.half == 0 ? 0 : size1)), search_state.c);
|
---|
2924 |
|
---|
2925 | fprintf (stderr, " superstet (rx=%d, &=%x: ",
|
---|
2926 | rxb->rx.rx_id, setp);
|
---|
2927 | while (setp)
|
---|
2928 | {
|
---|
2929 | fprintf (stderr, "%d ", setp->id);
|
---|
2930 | setp = setp->cdr;
|
---|
2931 | }
|
---|
2932 | fprintf (stderr, "\n");
|
---|
2933 | }
|
---|
2934 | #endif
|
---|
2935 | switch ((enum rx_opcode)inx)
|
---|
2936 | {
|
---|
2937 | case rx_do_side_effects:
|
---|
2938 |
|
---|
2939 | /* RX_DO_SIDE_EFFECTS occurs when we cross epsilon
|
---|
2940 | * edges associated with parentheses, backreferencing, etc.
|
---|
2941 | */
|
---|
2942 | {
|
---|
2943 | struct rx_distinct_future * df =
|
---|
2944 | (struct rx_distinct_future *)search_state.ifr->data_2;
|
---|
2945 | struct rx_se_list * el = df->effects;
|
---|
2946 | /* Side effects come in lists. This walks down
|
---|
2947 | * a list, dispatching.
|
---|
2948 | */
|
---|
2949 | while (el)
|
---|
2950 | {
|
---|
2951 | long effect;
|
---|
2952 | effect = (long)el->car;
|
---|
2953 | if (effect < 0)
|
---|
2954 | {
|
---|
2955 | #ifdef RX_DEBUG_0
|
---|
2956 | if (rx_debug_trace)
|
---|
2957 | {
|
---|
2958 | struct rx_superset * setp = search_state.super->contents;
|
---|
2959 |
|
---|
2960 | fprintf (stderr, "....%d %d>> %s\n", search_state.line_no,
|
---|
2961 | search_state.backtrack_depth,
|
---|
2962 | efnames[-effect]);
|
---|
2963 | }
|
---|
2964 | #endif
|
---|
2965 | switch ((enum re_side_effects) effect)
|
---|
2966 |
|
---|
2967 | {
|
---|
2968 | case re_se_pushback:
|
---|
2969 | search_state.ifr = &df->future_frame;
|
---|
2970 | if (!search_state.ifr->data)
|
---|
2971 | {
|
---|
2972 | struct rx_superstate * sup;
|
---|
2973 | sup = search_state.super;
|
---|
2974 | rx_lock_superstate (rx, sup);
|
---|
2975 | if (!rx_handle_cache_miss (&rxb->rx,
|
---|
2976 | search_state.super,
|
---|
2977 | search_state.c,
|
---|
2978 | (search_state.ifr
|
---|
2979 | ->data_2)))
|
---|
2980 | {
|
---|
2981 | rx_unlock_superstate (rx, sup);
|
---|
2982 | search_state.test_ret = rx_test_internal_error;
|
---|
2983 | goto test_do_return;
|
---|
2984 | }
|
---|
2985 | rx_unlock_superstate (rx, sup);
|
---|
2986 | }
|
---|
2987 | /* --search_state.test_pos.pos; */
|
---|
2988 | search_state.c = 't';
|
---|
2989 | search_state.super
|
---|
2990 | = ((struct rx_superstate *)
|
---|
2991 | ((char *)search_state.ifr->data
|
---|
2992 | - (long)(((struct rx_superstate *)0)
|
---|
2993 | ->transitions)));
|
---|
2994 | goto top_of_cycle;
|
---|
2995 | break;
|
---|
2996 | case re_se_push0:
|
---|
2997 | {
|
---|
2998 | struct rx_counter_frame * old_cf
|
---|
2999 | = (search_state.counter_stack
|
---|
3000 | ? ((struct rx_counter_frame *)
|
---|
3001 | search_state.counter_stack->sp)
|
---|
3002 | : 0);
|
---|
3003 | struct rx_counter_frame * cf;
|
---|
3004 | PUSH (search_state.counter_stack,
|
---|
3005 | sizeof (struct rx_counter_frame));
|
---|
3006 | cf = ((struct rx_counter_frame *)
|
---|
3007 | search_state.counter_stack->sp);
|
---|
3008 | cf->tag = re_se_iter;
|
---|
3009 | cf->val = 0;
|
---|
3010 | cf->inherited_from = 0;
|
---|
3011 | cf->cdr = old_cf;
|
---|
3012 | break;
|
---|
3013 | }
|
---|
3014 | case re_se_fail:
|
---|
3015 | goto test_do_return;
|
---|
3016 | case re_se_begbuf:
|
---|
3017 | if (!AT_STRINGS_BEG ())
|
---|
3018 | goto test_do_return;
|
---|
3019 | break;
|
---|
3020 | case re_se_endbuf:
|
---|
3021 | if (!AT_STRINGS_END ())
|
---|
3022 | goto test_do_return;
|
---|
3023 | break;
|
---|
3024 | case re_se_wordbeg:
|
---|
3025 | if ( LETTER_P (&search_state.test_pos, 1)
|
---|
3026 | && ( AT_STRINGS_BEG()
|
---|
3027 | || !LETTER_P (&search_state.test_pos, 0)))
|
---|
3028 | break;
|
---|
3029 | else
|
---|
3030 | goto test_do_return;
|
---|
3031 | case re_se_wordend:
|
---|
3032 | if ( !AT_STRINGS_BEG ()
|
---|
3033 | && LETTER_P (&search_state.test_pos, 0)
|
---|
3034 | && (AT_STRINGS_END ()
|
---|
3035 | || !LETTER_P (&search_state.test_pos, 1)))
|
---|
3036 | break;
|
---|
3037 | else
|
---|
3038 | goto test_do_return;
|
---|
3039 | case re_se_wordbound:
|
---|
3040 | if (AT_WORD_BOUNDARY (&search_state.test_pos))
|
---|
3041 | break;
|
---|
3042 | else
|
---|
3043 | goto test_do_return;
|
---|
3044 | case re_se_notwordbound:
|
---|
3045 | if (!AT_WORD_BOUNDARY (&search_state.test_pos))
|
---|
3046 | break;
|
---|
3047 | else
|
---|
3048 | goto test_do_return;
|
---|
3049 | case re_se_hat:
|
---|
3050 | if (AT_STRINGS_BEG ())
|
---|
3051 | {
|
---|
3052 | if (rxb->not_bol)
|
---|
3053 | goto test_do_return;
|
---|
3054 | else
|
---|
3055 | break;
|
---|
3056 | }
|
---|
3057 | else
|
---|
3058 | {
|
---|
3059 | char pos_c = *search_state.test_pos.pos;
|
---|
3060 | if ( (SRCH_TRANSLATE (pos_c)
|
---|
3061 | == SRCH_TRANSLATE('\n'))
|
---|
3062 | && rxb->newline_anchor)
|
---|
3063 | break;
|
---|
3064 | else
|
---|
3065 | goto test_do_return;
|
---|
3066 | }
|
---|
3067 | case re_se_dollar:
|
---|
3068 | if (AT_STRINGS_END ())
|
---|
3069 | {
|
---|
3070 | if (rxb->not_eol)
|
---|
3071 | goto test_do_return;
|
---|
3072 | else
|
---|
3073 | break;
|
---|
3074 | }
|
---|
3075 | else
|
---|
3076 | {
|
---|
3077 | if ( ( SRCH_TRANSLATE (fetch_char
|
---|
3078 | (&search_state.test_pos, 1,
|
---|
3079 | app_closure, stop))
|
---|
3080 | == SRCH_TRANSLATE ('\n'))
|
---|
3081 | && rxb->newline_anchor)
|
---|
3082 | break;
|
---|
3083 | else
|
---|
3084 | goto test_do_return;
|
---|
3085 | }
|
---|
3086 |
|
---|
3087 | case re_se_try:
|
---|
3088 | /* This is the first side effect in every
|
---|
3089 | * expression.
|
---|
3090 | *
|
---|
3091 | * FOR NO GOOD REASON...get rid of it...
|
---|
3092 | */
|
---|
3093 | break;
|
---|
3094 |
|
---|
3095 | case re_se_pushpos:
|
---|
3096 | {
|
---|
3097 | int urhere =
|
---|
3098 | ((int)(search_state.test_pos.pos
|
---|
3099 | - search_state.test_pos.string)
|
---|
3100 | + search_state.test_pos.offset);
|
---|
3101 | struct rx_counter_frame * old_cf
|
---|
3102 | = (search_state.counter_stack
|
---|
3103 | ? ((struct rx_counter_frame *)
|
---|
3104 | search_state.counter_stack->sp)
|
---|
3105 | : 0);
|
---|
3106 | struct rx_counter_frame * cf;
|
---|
3107 | PUSH(search_state.counter_stack,
|
---|
3108 | sizeof (struct rx_counter_frame));
|
---|
3109 | cf = ((struct rx_counter_frame *)
|
---|
3110 | search_state.counter_stack->sp);
|
---|
3111 | cf->tag = re_se_pushpos;
|
---|
3112 | cf->val = urhere;
|
---|
3113 | cf->inherited_from = 0;
|
---|
3114 | cf->cdr = old_cf;
|
---|
3115 | break;
|
---|
3116 | }
|
---|
3117 |
|
---|
3118 | case re_se_chkpos:
|
---|
3119 | {
|
---|
3120 | int urhere =
|
---|
3121 | ((int)(search_state.test_pos.pos
|
---|
3122 | - search_state.test_pos.string)
|
---|
3123 | + search_state.test_pos.offset);
|
---|
3124 | struct rx_counter_frame * cf
|
---|
3125 | = ((struct rx_counter_frame *)
|
---|
3126 | search_state.counter_stack->sp);
|
---|
3127 | if (cf->val == urhere)
|
---|
3128 | goto test_do_return;
|
---|
3129 | cf->val = urhere;
|
---|
3130 | break;
|
---|
3131 | }
|
---|
3132 | break;
|
---|
3133 |
|
---|
3134 | case re_se_poppos:
|
---|
3135 | POP(search_state.counter_stack,
|
---|
3136 | sizeof (struct rx_counter_frame));
|
---|
3137 | break;
|
---|
3138 |
|
---|
3139 |
|
---|
3140 | case re_se_at_dot:
|
---|
3141 | case re_se_syntax:
|
---|
3142 | case re_se_not_syntax:
|
---|
3143 | #ifdef emacs
|
---|
3144 | /*
|
---|
3145 | * this release lacks emacs support
|
---|
3146 | */
|
---|
3147 | #endif
|
---|
3148 | break;
|
---|
3149 | case re_se_win:
|
---|
3150 | case re_se_lparen:
|
---|
3151 | case re_se_rparen:
|
---|
3152 | case re_se_backref:
|
---|
3153 | case re_se_iter:
|
---|
3154 | case re_se_end_iter:
|
---|
3155 | case re_se_tv:
|
---|
3156 | case re_floogle_flap:
|
---|
3157 | search_state.ret_val = 0;
|
---|
3158 | goto test_do_return;
|
---|
3159 | }
|
---|
3160 | }
|
---|
3161 | else
|
---|
3162 | {
|
---|
3163 | #ifdef RX_DEBUG_0
|
---|
3164 | if (rx_debug_trace)
|
---|
3165 | fprintf (stderr, "....%d %d>> %s %d %d\n", search_state.line_no,
|
---|
3166 | search_state.backtrack_depth,
|
---|
3167 | efnames2[rxb->se_params [effect].se],
|
---|
3168 | rxb->se_params [effect].op1,
|
---|
3169 | rxb->se_params [effect].op2);
|
---|
3170 | #endif
|
---|
3171 | switch (rxb->se_params [effect].se)
|
---|
3172 | {
|
---|
3173 | case re_se_win:
|
---|
3174 | /* This side effect indicates that we've
|
---|
3175 | * found a match, though not necessarily the
|
---|
3176 | * best match. This is a fancy assignment to
|
---|
3177 | * register 0 unless the caller didn't
|
---|
3178 | * care about registers. In which case,
|
---|
3179 | * this stops the match.
|
---|
3180 | */
|
---|
3181 | {
|
---|
3182 | int urhere =
|
---|
3183 | ((int)(search_state.test_pos.pos
|
---|
3184 | - search_state.test_pos.string)
|
---|
3185 | + search_state.test_pos.offset);
|
---|
3186 |
|
---|
3187 | if ( (search_state.best_last_r < 0)
|
---|
3188 | || (urhere + 1 > search_state.best_rparen[0]))
|
---|
3189 | {
|
---|
3190 | /* Record the best known and keep
|
---|
3191 | * looking.
|
---|
3192 | */
|
---|
3193 | int x;
|
---|
3194 | for (x = 0; x <= search_state.last_l; ++x)
|
---|
3195 | search_state.best_lparen[x] = search_state.lparen[x];
|
---|
3196 | search_state.best_last_l = search_state.last_l;
|
---|
3197 | for (x = 0; x <= search_state.last_r; ++x)
|
---|
3198 | search_state.best_rparen[x] = search_state.rparen[x];
|
---|
3199 | search_state.best_rparen[0] = urhere + 1;
|
---|
3200 | search_state.best_last_r = search_state.last_r;
|
---|
3201 | }
|
---|
3202 | /* If we're not reporting the match-length
|
---|
3203 | * or other register info, we need look no
|
---|
3204 | * further.
|
---|
3205 | */
|
---|
3206 | if (search_state.first_found)
|
---|
3207 | {
|
---|
3208 | search_state.test_ret = rx_test_found_first;
|
---|
3209 | goto test_do_return;
|
---|
3210 | }
|
---|
3211 | }
|
---|
3212 | break;
|
---|
3213 | case re_se_lparen:
|
---|
3214 | {
|
---|
3215 | int urhere =
|
---|
3216 | ((int)(search_state.test_pos.pos
|
---|
3217 | - search_state.test_pos.string)
|
---|
3218 | + search_state.test_pos.offset);
|
---|
3219 |
|
---|
3220 | int reg = rxb->se_params [effect].op1;
|
---|
3221 | #if 0
|
---|
3222 | if (reg > search_state.last_l)
|
---|
3223 | #endif
|
---|
3224 | {
|
---|
3225 | search_state.lparen[reg] = urhere + 1;
|
---|
3226 | /* In addition to making this assignment,
|
---|
3227 | * we now know that lower numbered regs
|
---|
3228 | * that haven't already been assigned,
|
---|
3229 | * won't be. We make sure they're
|
---|
3230 | * filled with -1, so they can be
|
---|
3231 | * recognized as unassigned.
|
---|
3232 | */
|
---|
3233 | if (search_state.last_l < reg)
|
---|
3234 | while (++search_state.last_l < reg)
|
---|
3235 | search_state.lparen[search_state.last_l] = -1;
|
---|
3236 | }
|
---|
3237 | break;
|
---|
3238 | }
|
---|
3239 |
|
---|
3240 | case re_se_rparen:
|
---|
3241 | {
|
---|
3242 | int urhere =
|
---|
3243 | ((int)(search_state.test_pos.pos
|
---|
3244 | - search_state.test_pos.string)
|
---|
3245 | + search_state.test_pos.offset);
|
---|
3246 | int reg = rxb->se_params [effect].op1;
|
---|
3247 | search_state.rparen[reg] = urhere + 1;
|
---|
3248 | if (search_state.last_r < reg)
|
---|
3249 | {
|
---|
3250 | while (++search_state.last_r < reg)
|
---|
3251 | search_state.rparen[search_state.last_r]
|
---|
3252 | = -1;
|
---|
3253 | }
|
---|
3254 | break;
|
---|
3255 | }
|
---|
3256 |
|
---|
3257 | case re_se_backref:
|
---|
3258 | {
|
---|
3259 | int reg = rxb->se_params [effect].op1;
|
---|
3260 | if ( reg > search_state.last_r
|
---|
3261 | || search_state.rparen[reg] < 0)
|
---|
3262 | goto test_do_return;
|
---|
3263 |
|
---|
3264 | {
|
---|
3265 | int backref_status;
|
---|
3266 | check_backreference:
|
---|
3267 | backref_status
|
---|
3268 | = back_check (&search_state.test_pos,
|
---|
3269 | search_state.lparen[reg],
|
---|
3270 | search_state.rparen[reg],
|
---|
3271 | search_state.translate,
|
---|
3272 | app_closure,
|
---|
3273 | stop);
|
---|
3274 | switch (backref_status)
|
---|
3275 | {
|
---|
3276 | case rx_back_check_continuation:
|
---|
3277 | search_state.saved_reg = reg;
|
---|
3278 | test_pc = rx_test_backreference_check;
|
---|
3279 | goto test_return_continuation;
|
---|
3280 | resume_continuation_2:
|
---|
3281 | reg = search_state.saved_reg;
|
---|
3282 | goto check_backreference;
|
---|
3283 | case rx_back_check_fail:
|
---|
3284 | /* Fail */
|
---|
3285 | goto test_do_return;
|
---|
3286 | case rx_back_check_pass:
|
---|
3287 | /* pass --
|
---|
3288 | * test_pos now advanced to last
|
---|
3289 | * char matched by backref
|
---|
3290 | */
|
---|
3291 | break;
|
---|
3292 | }
|
---|
3293 | }
|
---|
3294 | break;
|
---|
3295 | }
|
---|
3296 | case re_se_iter:
|
---|
3297 | {
|
---|
3298 | struct rx_counter_frame * csp
|
---|
3299 | = ((struct rx_counter_frame *)
|
---|
3300 | search_state.counter_stack->sp);
|
---|
3301 | if (csp->val == rxb->se_params[effect].op2)
|
---|
3302 | goto test_do_return;
|
---|
3303 | else
|
---|
3304 | ++csp->val;
|
---|
3305 | break;
|
---|
3306 | }
|
---|
3307 | case re_se_end_iter:
|
---|
3308 | {
|
---|
3309 | struct rx_counter_frame * csp
|
---|
3310 | = ((struct rx_counter_frame *)
|
---|
3311 | search_state.counter_stack->sp);
|
---|
3312 | if (csp->val < rxb->se_params[effect].op1)
|
---|
3313 | goto test_do_return;
|
---|
3314 | else
|
---|
3315 | {
|
---|
3316 | struct rx_counter_frame * source = csp;
|
---|
3317 | while (source->inherited_from)
|
---|
3318 | source = source->inherited_from;
|
---|
3319 | if (!source || !source->cdr)
|
---|
3320 | {
|
---|
3321 | POP(search_state.counter_stack,
|
---|
3322 | sizeof(struct rx_counter_frame));
|
---|
3323 | }
|
---|
3324 | else
|
---|
3325 | {
|
---|
3326 | source = source->cdr;
|
---|
3327 | csp->val = source->val;
|
---|
3328 | csp->tag = source->tag;
|
---|
3329 | csp->cdr = 0;
|
---|
3330 | csp->inherited_from = source;
|
---|
3331 | }
|
---|
3332 | }
|
---|
3333 | break;
|
---|
3334 | }
|
---|
3335 | case re_se_tv:
|
---|
3336 | /* is a noop */
|
---|
3337 | break;
|
---|
3338 | case re_se_try:
|
---|
3339 | case re_se_pushback:
|
---|
3340 | case re_se_push0:
|
---|
3341 | case re_se_pushpos:
|
---|
3342 | case re_se_chkpos:
|
---|
3343 | case re_se_poppos:
|
---|
3344 | case re_se_at_dot:
|
---|
3345 | case re_se_syntax:
|
---|
3346 | case re_se_not_syntax:
|
---|
3347 | case re_se_begbuf:
|
---|
3348 | case re_se_hat:
|
---|
3349 | case re_se_wordbeg:
|
---|
3350 | case re_se_wordbound:
|
---|
3351 | case re_se_notwordbound:
|
---|
3352 | case re_se_wordend:
|
---|
3353 | case re_se_endbuf:
|
---|
3354 | case re_se_dollar:
|
---|
3355 | case re_se_fail:
|
---|
3356 | case re_floogle_flap:
|
---|
3357 | search_state.ret_val = 0;
|
---|
3358 | goto test_do_return;
|
---|
3359 | }
|
---|
3360 | }
|
---|
3361 | el = el->cdr;
|
---|
3362 | }
|
---|
3363 | /* Now the side effects are done,
|
---|
3364 | * so get the next instruction.
|
---|
3365 | * and move on.
|
---|
3366 | */
|
---|
3367 | search_state.ifr = &df->future_frame;
|
---|
3368 | goto restart;
|
---|
3369 | }
|
---|
3370 |
|
---|
3371 | case rx_backtrack_point:
|
---|
3372 | {
|
---|
3373 | /* A backtrack point indicates that we've reached a
|
---|
3374 | * non-determinism in the superstate NFA. This is a
|
---|
3375 | * loop that exhaustively searches the possibilities.
|
---|
3376 | *
|
---|
3377 | * A backtracking strategy is used. We keep track of what
|
---|
3378 | * registers are valid so we can erase side effects.
|
---|
3379 | *
|
---|
3380 | * First, make sure there is some stack space to hold
|
---|
3381 | * our state.
|
---|
3382 | */
|
---|
3383 |
|
---|
3384 | struct rx_backtrack_frame * bf;
|
---|
3385 |
|
---|
3386 | PUSH(search_state.backtrack_stack,
|
---|
3387 | search_state.backtrack_frame_bytes);
|
---|
3388 | #ifdef RX_DEBUG_0
|
---|
3389 | ++search_state.backtrack_depth;
|
---|
3390 | #endif
|
---|
3391 |
|
---|
3392 | bf = ((struct rx_backtrack_frame *)
|
---|
3393 | search_state.backtrack_stack->sp);
|
---|
3394 | {
|
---|
3395 | bf->stk_super = search_state.super;
|
---|
3396 | /* We prevent the current superstate from being
|
---|
3397 | * deleted from the superstate cache.
|
---|
3398 | */
|
---|
3399 | rx_lock_superstate (&rxb->rx, search_state.super);
|
---|
3400 | #ifdef RX_DEBUG_0
|
---|
3401 | bf->stk_search_state.line_no = search_state.line_no;
|
---|
3402 | #endif
|
---|
3403 | bf->stk_c = search_state.c;
|
---|
3404 | bf->stk_test_pos = search_state.test_pos;
|
---|
3405 | bf->stk_last_l = search_state.last_l;
|
---|
3406 | bf->stk_last_r = search_state.last_r;
|
---|
3407 | bf->df = ((struct rx_super_edge *)
|
---|
3408 | search_state.ifr->data_2)->options;
|
---|
3409 | bf->first_df = bf->df;
|
---|
3410 | bf->counter_stack_sp = (search_state.counter_stack
|
---|
3411 | ? search_state.counter_stack->sp
|
---|
3412 | : 0);
|
---|
3413 | bf->stk_test_ret = search_state.test_ret;
|
---|
3414 | if (rxb->match_regs_on_stack)
|
---|
3415 | {
|
---|
3416 | int x;
|
---|
3417 | regoff_t * stk =
|
---|
3418 | (regoff_t *)((char *)bf + sizeof (*bf));
|
---|
3419 | for (x = 0; x <= search_state.last_l; ++x)
|
---|
3420 | stk[x] = search_state.lparen[x];
|
---|
3421 | stk += x;
|
---|
3422 | for (x = 0; x <= search_state.last_r; ++x)
|
---|
3423 | stk[x] = search_state.rparen[x];
|
---|
3424 | }
|
---|
3425 | }
|
---|
3426 |
|
---|
3427 | /* Here is a while loop whose body is mainly a function
|
---|
3428 | * call and some code to handle a return from that
|
---|
3429 | * function.
|
---|
3430 | *
|
---|
3431 | * From here on for the rest of `case backtrack_point' it
|
---|
3432 | * is unsafe to assume that the search_state copies of
|
---|
3433 | * variables saved on the backtracking stack are valid
|
---|
3434 | * -- so read their values from the backtracking stack.
|
---|
3435 | *
|
---|
3436 | * This lets us use one generation fewer stack saves in
|
---|
3437 | * the call-graph of a search.
|
---|
3438 | */
|
---|
3439 |
|
---|
3440 | while_non_det_options:
|
---|
3441 | #ifdef RX_DEBUG_0
|
---|
3442 | ++search_state.lines_found;
|
---|
3443 | if (rx_debug_trace)
|
---|
3444 | fprintf (stderr, "@@@ %d calls %d @@@\n",
|
---|
3445 | search_state.line_no, search_state.lines_found);
|
---|
3446 |
|
---|
3447 | search_state.line_no = search_state.lines_found;
|
---|
3448 | #endif
|
---|
3449 |
|
---|
3450 | if (bf->df->next_same_super_edge[0] == bf->first_df)
|
---|
3451 | {
|
---|
3452 | /* This is a tail-call optimization -- we don't recurse
|
---|
3453 | * for the last of the possible futures.
|
---|
3454 | */
|
---|
3455 | search_state.ifr = (bf->df->effects
|
---|
3456 | ? &bf->df->side_effects_frame
|
---|
3457 | : &bf->df->future_frame);
|
---|
3458 |
|
---|
3459 | rx_unlock_superstate (&rxb->rx, search_state.super);
|
---|
3460 | POP(search_state.backtrack_stack,
|
---|
3461 | search_state.backtrack_frame_bytes);
|
---|
3462 | #ifdef RX_DEBUG
|
---|
3463 | --search_state.backtrack_depth;
|
---|
3464 | #endif
|
---|
3465 | goto restart;
|
---|
3466 | }
|
---|
3467 | else
|
---|
3468 | {
|
---|
3469 | if (search_state.counter_stack)
|
---|
3470 | {
|
---|
3471 | struct rx_counter_frame * old_cf
|
---|
3472 | = ((struct rx_counter_frame *)search_state.counter_stack->sp);
|
---|
3473 | struct rx_counter_frame * cf;
|
---|
3474 | PUSH(search_state.counter_stack, sizeof (struct rx_counter_frame));
|
---|
3475 | cf = ((struct rx_counter_frame *)search_state.counter_stack->sp);
|
---|
3476 | cf->tag = old_cf->tag;
|
---|
3477 | cf->val = old_cf->val;
|
---|
3478 | cf->inherited_from = old_cf;
|
---|
3479 | cf->cdr = 0;
|
---|
3480 | }
|
---|
3481 | /* `Call' this test-match block */
|
---|
3482 | search_state.ifr = (bf->df->effects
|
---|
3483 | ? &bf->df->side_effects_frame
|
---|
3484 | : &bf->df->future_frame);
|
---|
3485 | goto recurse_test_match;
|
---|
3486 | }
|
---|
3487 |
|
---|
3488 | /* Returns in this block are accomplished by
|
---|
3489 | * goto test_do_return. There are two cases.
|
---|
3490 | * If there is some search-stack left,
|
---|
3491 | * then it is a return from a `recursive' call.
|
---|
3492 | * If there is no search-stack left, then
|
---|
3493 | * we should return to the fastmap/search loop.
|
---|
3494 | */
|
---|
3495 |
|
---|
3496 | test_do_return:
|
---|
3497 |
|
---|
3498 | if (!search_state.backtrack_stack)
|
---|
3499 | {
|
---|
3500 | #ifdef RX_DEBUG_0
|
---|
3501 | if (rx_debug_trace)
|
---|
3502 | fprintf (stderr, "!!! %d bails returning %d !!!\n",
|
---|
3503 | search_state.line_no, search_state.test_ret);
|
---|
3504 | #endif
|
---|
3505 |
|
---|
3506 | /* No more search-stack -- this test is done. */
|
---|
3507 | if (search_state.test_ret)
|
---|
3508 | goto return_from_test_match;
|
---|
3509 | else
|
---|
3510 | goto error_in_testing_match;
|
---|
3511 | }
|
---|
3512 |
|
---|
3513 | /* Returning from a recursive call to
|
---|
3514 | * the test match block:
|
---|
3515 | */
|
---|
3516 |
|
---|
3517 | bf = ((struct rx_backtrack_frame *)
|
---|
3518 | search_state.backtrack_stack->sp);
|
---|
3519 | #ifdef RX_DEBUG_0
|
---|
3520 | if (rx_debug_trace)
|
---|
3521 | fprintf (stderr, "+++ %d returns %d (to %d)+++\n",
|
---|
3522 | search_state.line_no,
|
---|
3523 | search_state.test_ret,
|
---|
3524 | bf->stk_search_state.line_no);
|
---|
3525 | #endif
|
---|
3526 |
|
---|
3527 | while (search_state.counter_stack
|
---|
3528 | && (!bf->counter_stack_sp
|
---|
3529 | || (bf->counter_stack_sp
|
---|
3530 | != search_state.counter_stack->sp)))
|
---|
3531 | {
|
---|
3532 | POP(search_state.counter_stack,
|
---|
3533 | sizeof (struct rx_counter_frame));
|
---|
3534 | }
|
---|
3535 |
|
---|
3536 | if (search_state.test_ret == rx_test_error)
|
---|
3537 | {
|
---|
3538 | POP (search_state.backtrack_stack,
|
---|
3539 | search_state.backtrack_frame_bytes);
|
---|
3540 | goto test_do_return;
|
---|
3541 | }
|
---|
3542 |
|
---|
3543 | /* If a non-longest match was found and that is good
|
---|
3544 | * enough, return immediately.
|
---|
3545 | */
|
---|
3546 | if ( (search_state.test_ret == rx_test_found_first)
|
---|
3547 | && search_state.first_found)
|
---|
3548 | {
|
---|
3549 | rx_unlock_superstate (&rxb->rx, bf->stk_super);
|
---|
3550 | POP (search_state.backtrack_stack,
|
---|
3551 | search_state.backtrack_frame_bytes);
|
---|
3552 | goto test_do_return;
|
---|
3553 | }
|
---|
3554 |
|
---|
3555 | search_state.test_ret = bf->stk_test_ret;
|
---|
3556 | search_state.last_l = bf->stk_last_l;
|
---|
3557 | search_state.last_r = bf->stk_last_r;
|
---|
3558 | bf->df = bf->df->next_same_super_edge[0];
|
---|
3559 | search_state.super = bf->stk_super;
|
---|
3560 | search_state.c = bf->stk_c;
|
---|
3561 | #ifdef RX_DEBUG_0
|
---|
3562 | search_state.line_no = bf->stk_search_state.line_no;
|
---|
3563 | #endif
|
---|
3564 |
|
---|
3565 | if (rxb->match_regs_on_stack)
|
---|
3566 | {
|
---|
3567 | int x;
|
---|
3568 | regoff_t * stk =
|
---|
3569 | (regoff_t *)((char *)bf + sizeof (*bf));
|
---|
3570 | for (x = 0; x <= search_state.last_l; ++x)
|
---|
3571 | search_state.lparen[x] = stk[x];
|
---|
3572 | stk += x;
|
---|
3573 | for (x = 0; x <= search_state.last_r; ++x)
|
---|
3574 | search_state.rparen[x] = stk[x];
|
---|
3575 | }
|
---|
3576 |
|
---|
3577 | {
|
---|
3578 | int x;
|
---|
3579 | try_burst_2:
|
---|
3580 | x = get_burst (&bf->stk_test_pos, app_closure, stop);
|
---|
3581 | switch (x)
|
---|
3582 | {
|
---|
3583 | case rx_get_burst_continuation:
|
---|
3584 | search_state.saved_bf = bf;
|
---|
3585 | test_pc = rx_test_backtrack_return;
|
---|
3586 | goto test_return_continuation;
|
---|
3587 | resume_continuation_3:
|
---|
3588 | bf = search_state.saved_bf;
|
---|
3589 | goto try_burst_2;
|
---|
3590 | case rx_get_burst_no_more:
|
---|
3591 | /* Since we've been here before, it is some kind of
|
---|
3592 | * error that we can't return.
|
---|
3593 | */
|
---|
3594 | case rx_get_burst_error:
|
---|
3595 | search_state.test_ret = rx_test_internal_error;
|
---|
3596 | goto test_do_return;
|
---|
3597 | case rx_get_burst_ok:
|
---|
3598 | break;
|
---|
3599 | }
|
---|
3600 | }
|
---|
3601 | search_state.test_pos = bf->stk_test_pos;
|
---|
3602 | goto while_non_det_options;
|
---|
3603 | }
|
---|
3604 |
|
---|
3605 |
|
---|
3606 | case rx_cache_miss:
|
---|
3607 | /* Because the superstate NFA is lazily constructed,
|
---|
3608 | * and in fact may erode from underneath us, we sometimes
|
---|
3609 | * have to construct the next instruction from the hard way.
|
---|
3610 | * This invokes one step in the lazy-conversion.
|
---|
3611 | */
|
---|
3612 | search_state.ifr = rx_handle_cache_miss (&rxb->rx,
|
---|
3613 | search_state.super,
|
---|
3614 | search_state.c,
|
---|
3615 | search_state.ifr->data_2);
|
---|
3616 | if (!search_state.ifr)
|
---|
3617 | {
|
---|
3618 | search_state.test_ret = rx_test_internal_error;
|
---|
3619 | goto test_do_return;
|
---|
3620 | }
|
---|
3621 | goto restart;
|
---|
3622 |
|
---|
3623 | case rx_backtrack:
|
---|
3624 | /* RX_BACKTRACK means that we've reached the empty
|
---|
3625 | * superstate, indicating that match can't succeed
|
---|
3626 | * from this point.
|
---|
3627 | */
|
---|
3628 | goto test_do_return;
|
---|
3629 |
|
---|
3630 | case rx_next_char:
|
---|
3631 | case rx_error_inx:
|
---|
3632 | case rx_num_instructions:
|
---|
3633 | search_state.ret_val = 0;
|
---|
3634 | goto test_do_return;
|
---|
3635 | }
|
---|
3636 | goto pseudo_while_1;
|
---|
3637 | }
|
---|
3638 |
|
---|
3639 | /* Healthy exits from the test-match loop do a
|
---|
3640 | * `goto return_from_test_match' On the other hand,
|
---|
3641 | * we might end up here.
|
---|
3642 | */
|
---|
3643 | error_in_testing_match:
|
---|
3644 | test_state = rx_test_error;
|
---|
3645 | goto test_returns_to_search;
|
---|
3646 |
|
---|
3647 | /***** fastmap/search loop body
|
---|
3648 | * considering the results testing for a match
|
---|
3649 | */
|
---|
3650 |
|
---|
3651 | return_from_test_match:
|
---|
3652 |
|
---|
3653 | if (search_state.best_last_l >= 0)
|
---|
3654 | {
|
---|
3655 | if (regs && (regs->start != search_state.best_lparen))
|
---|
3656 | {
|
---|
3657 | bcopy (search_state.best_lparen, regs->start,
|
---|
3658 | regs->num_regs * sizeof (int));
|
---|
3659 | bcopy (search_state.best_rparen, regs->end,
|
---|
3660 | regs->num_regs * sizeof (int));
|
---|
3661 | }
|
---|
3662 | if (regs && !rxb->no_sub)
|
---|
3663 | {
|
---|
3664 | int q;
|
---|
3665 | int bound = (regs->num_regs > search_state.num_regs
|
---|
3666 | ? regs->num_regs
|
---|
3667 | : search_state.num_regs);
|
---|
3668 | regoff_t * s = regs->start;
|
---|
3669 | regoff_t * e = regs->end;
|
---|
3670 | for (q = search_state.best_last_l + 1; q < bound; ++q)
|
---|
3671 | s[q] = e[q] = -1;
|
---|
3672 | }
|
---|
3673 | search_state.ret_val = search_state.best_lparen[0];
|
---|
3674 | test_state = rx_test_ok;
|
---|
3675 | goto test_returns_to_search;
|
---|
3676 | }
|
---|
3677 | else
|
---|
3678 | {
|
---|
3679 | test_state = rx_test_fail;
|
---|
3680 | goto test_returns_to_search;
|
---|
3681 | }
|
---|
3682 |
|
---|
3683 | test_return_continuation:
|
---|
3684 | search_state.test_match_resume_pt = test_pc;
|
---|
3685 | test_state = rx_test_continuation;
|
---|
3686 | goto test_returns_to_search;
|
---|
3687 | }
|
---|
3688 | }
|
---|
3689 |
|
---|
3690 |
|
---|
3691 |
|
---|
3692 | #endif /* RX_WANT_RX_DEFS */
|
---|
3693 |
|
---|
3694 |
|
---|
3695 |
|
---|
3696 | #else /* RX_WANT_SE_DEFS */
|
---|
3697 | /* Integers are used to represent side effects.
|
---|
3698 | *
|
---|
3699 | * Simple side effects are given negative integer names by these enums.
|
---|
3700 | *
|
---|
3701 | * Non-negative names are reserved for complex effects.
|
---|
3702 | *
|
---|
3703 | * Complex effects are those that take arguments. For example,
|
---|
3704 | * a register assignment associated with a group is complex because
|
---|
3705 | * it requires an argument to tell which group is being matched.
|
---|
3706 | *
|
---|
3707 | * The integer name of a complex effect is an index into rxb->se_params.
|
---|
3708 | */
|
---|
3709 |
|
---|
3710 | RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */
|
---|
3711 |
|
---|
3712 | RX_DEF_SE(0, re_se_pushback, = re_se_try - 1)
|
---|
3713 | RX_DEF_SE(0, re_se_push0, = re_se_pushback -1)
|
---|
3714 | RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1)
|
---|
3715 | RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1)
|
---|
3716 | RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1)
|
---|
3717 |
|
---|
3718 | RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */
|
---|
3719 | RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */
|
---|
3720 | RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */
|
---|
3721 |
|
---|
3722 | RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */
|
---|
3723 | RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */
|
---|
3724 |
|
---|
3725 | RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1)
|
---|
3726 | RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1)
|
---|
3727 | RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1)
|
---|
3728 |
|
---|
3729 | RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1)
|
---|
3730 | RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1)
|
---|
3731 |
|
---|
3732 | /* This fails except at the end of a line.
|
---|
3733 | * It deserves to go here since it is typicly one of the last steps
|
---|
3734 | * in a match.
|
---|
3735 | */
|
---|
3736 | RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1)
|
---|
3737 |
|
---|
3738 | /* Simple effects: */
|
---|
3739 | RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1)
|
---|
3740 |
|
---|
3741 | /* Complex effects. These are used in the 'se' field of
|
---|
3742 | * a struct re_se_params. Indexes into the se array
|
---|
3743 | * are stored as instructions on nfa edges.
|
---|
3744 | */
|
---|
3745 | RX_DEF_CPLX_SE(1, re_se_win, = 0)
|
---|
3746 | RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1)
|
---|
3747 | RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1)
|
---|
3748 | RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1)
|
---|
3749 | RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1)
|
---|
3750 | RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1)
|
---|
3751 | RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1)
|
---|
3752 |
|
---|
3753 | #endif
|
---|
3754 |
|
---|
3755 | #endif
|
---|