source: gsdl/trunk/trunk/mg/lib/rx.h@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 113.4 KB
Line 
1#if !defined(RXH) || defined(RX_WANT_SE_DEFS)
2#define RXH
3
4/* Copyright (C) 1992, 1993 Free Software Foundation, Inc.
5
6This file is part of the librx library.
7
8Librx is free software; you can redistribute it and/or modify it under
9the terms of the GNU Library General Public License as published by
10the Free Software Foundation; either version 2, or (at your option)
11any later version.
12
13Librx is distributed in the hope that it will be useful, but WITHOUT
14ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU Library General Public
19License along with this software; see the file COPYING.LIB. If not,
20write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
2102139, USA. */
22/* t. lord Wed Sep 23 18:20:57 1992 */
23
24
25
26
27
28
29
30
31
32#ifndef RX_WANT_SE_DEFS
33
34/* This page: Bitsets */
35
36#ifndef RX_subset
37typedef unsigned int RX_subset;
38#define RX_subset_bits (32)
39#define RX_subset_mask (RX_subset_bits - 1)
40#endif
41
42typedef RX_subset * rx_Bitset;
43
44#ifdef __STDC__
45typedef void (*rx_bitset_iterator) (void *, int member_index);
46#else
47typedef void (*rx_bitset_iterator) ();
48#endif
49
50#define rx_bitset_subset(N) ((N) / RX_subset_bits)
51#define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)])
52#define RX_bitset_access(B,N,OP) \
53 ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
54#define RX_bitset_member(B,N) RX_bitset_access(B, N, &)
55#define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=)
56#define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~)
57#define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= )
58#define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
59#define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset))
60
61
62
63
64/* This page: Splay trees. */
65
66#ifdef __STDC__
67typedef int (*rx_sp_comparer) (void * a, void * b);
68#else
69typedef int (*rx_sp_comparer) ();
70#endif
71
72struct rx_sp_node
73{
74 void * key;
75 void * data;
76 struct rx_sp_node * kids[2];
77};
78
79#ifdef __STDC__
80typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
81#else
82typedef void (*rx_sp_key_data_freer) ();
83#endif
84
85
86
87/* giant inflatable hash trees */
88
89struct rx_hash_item
90{
91 struct rx_hash_item * next_same_hash;
92 struct rx_hash * table;
93 unsigned long hash;
94 void * data;
95 void * binding;
96};
97
98struct rx_hash
99{
100 struct rx_hash * parent;
101 int refs;
102 struct rx_hash * children[13];
103 struct rx_hash_item * buckets [13];
104 int bucket_size [13];
105};
106
107struct rx_hash_rules;
108
109#ifdef __STDC__
110/* should return like == */
111typedef int (*rx_hash_eq)(void *, void *);
112typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
113typedef void (*rx_free_hash)(struct rx_hash *,
114 struct rx_hash_rules *);
115typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
116 void *);
117typedef void (*rx_free_hash_item)(struct rx_hash_item *,
118 struct rx_hash_rules *);
119#else
120typedef int (*rx_hash_eq)();
121typedef struct rx_hash * (*rx_alloc_hash)();
122typedef void (*rx_free_hash)();
123typedef struct rx_hash_item * (*rx_alloc_hash_item)();
124typedef void (*rx_free_hash_item)();
125#endif
126
127struct rx_hash_rules
128{
129 rx_hash_eq eq;
130 rx_alloc_hash hash_alloc;
131 rx_free_hash free_hash;
132 rx_alloc_hash_item hash_item_alloc;
133 rx_free_hash_item free_hash_item;
134};
135
136
137
138/* Forward declarations */
139
140struct rx_cache;
141struct rx_superset;
142struct rx;
143struct rx_se_list;
144
145
146
147
148/*
149 * GLOSSARY
150 *
151 * regexp
152 * regular expression
153 * expression
154 * pattern - a `regular' expression. The expression
155 * need not be formally regular -- it can contain
156 * constructs that don't correspond to purely regular
157 * expressions.
158 *
159 * buffer
160 * string - the string (or strings) being searched or matched.
161 *
162 * pattern buffer - a structure of type `struct re_pattern_buffer'
163 * This in turn contains a `struct rx', which holds the
164 * NFA compiled from a pattern, as well as some of the state
165 * of a matcher using the pattern.
166 *
167 * NFA - nondeterministic finite automata. Some people
168 * use this term to a member of the class of
169 * regular automata (those corresponding to a regular
170 * language). However, in this code, the meaning is
171 * more general. The automata used by Rx are comperable
172 * in power to what are usually called `push down automata'.
173 *
174 * Two NFA are built by rx for every pattern. One is built
175 * by the compiler. The other is built from the first, on
176 * the fly, by the matcher. The latter is called the `superstate
177 * NFA' because its states correspond to sets of states from
178 * the first NFA. (Joe Keane gets credit for the name
179 * `superstate NFA').
180 *
181 * NFA edges
182 * epsilon edges
183 * side-effect edges - The NFA compiled from a pattern can have three
184 * kinds of edges. Epsilon edges can be taken freely anytime
185 * their source state is reached. Character set edges can be
186 * taken when their source state is reached and when the next
187 * character in the buffer is a member of the set. Side effect
188 * edges imply a transition that can only be taken after the
189 * indicated side effect has been successfully accomplished.
190 * Some examples of side effects are:
191 *
192 * Storing the current match position to record the
193 * location of a parentesized subexpression.
194 *
195 * Advancing the matcher over N characters if they
196 * match the N characters previously matched by a
197 * parentesized subexpression.
198 *
199 * Both of those kinds of edges occur in the NFA generated
200 * by the pattern: \(.\)\1
201 *
202 * Epsilon and side effect edges are similar. Unfortunately,
203 * some of the code uses the name `epsilon edge' to mean
204 * both epsilon and side effect edges. For example, the
205 * function has_non_idempotent_epsilon_path computes the existance
206 * of a non-trivial path containing only a mix of epsilon and
207 * side effect edges. In that case `nonidempotent epsilon' is being
208 * used to mean `side effect'.
209 */
210
211
212
213
214
215
216/* LOW LEVEL PATTERN BUFFERS */
217
218/* Suppose that from some NFA state, more than one path through
219 * side-effect edges is possible. In what order should the paths
220 * be tried? A function of type rx_se_list_order answers that
221 * question. It compares two lists of side effects, and says
222 * which list comes first.
223 */
224
225#ifdef __STDC__
226typedef int (*rx_se_list_order) (struct rx *,
227 struct rx_se_list *,
228 struct rx_se_list *);
229#else
230typedef int (*rx_se_list_order) ();
231#endif
232
233
234
235/* Struct RX holds a compiled regular expression - that is, an nfa
236 * ready to be converted on demand to a more efficient superstate nfa.
237 * This is for the low level interface. The high-level interfaces enclose
238 * this in a `struct re_pattern_buffer'.
239 */
240struct rx
241{
242 /* The compiler assigns a unique id to every pattern.
243 * Like sequence numbers in X, there is a subtle bug here
244 * if you use Rx in a system that runs for a long time.
245 * But, because of the way the caches work out, it is almost
246 * impossible to trigger the Rx version of this bug.
247 *
248 * The id is used to validate superstates found in a cache
249 * of superstates. It isn't sufficient to let a superstate
250 * point back to the rx for which it was compiled -- the caller
251 * may be re-using a `struct rx' in which case the superstate
252 * is not really valid. So instead, superstates are validated
253 * by checking the sequence number of the pattern for which
254 * they were built.
255 */
256 int rx_id;
257
258 /* This is memory mgt. state for superstates. This may be
259 * shared by more than one struct rx.
260 */
261 struct rx_cache * cache;
262
263 /* Every regex defines the size of its own character set.
264 * A superstate has an array of this size, with each element
265 * a `struct rx_inx'. So, don't make this number too large.
266 * In particular, don't make it 2^16.
267 */
268 int local_cset_size;
269
270 /* After the NFA is built, it is copied into a contiguous region
271 * of memory (mostly for compatability with GNU regex).
272 * Here is that region, and it's size:
273 */
274 void * buffer;
275 unsigned long allocated;
276
277 /* Clients of RX can ask for some extra storage in the space pointed
278 * to by BUFFER. The field RESERVED is an input parameter to the
279 * compiler. After compilation, this much space will be available
280 * at (buffer + allocated - reserved)
281 */
282 unsigned long reserved;
283
284 /* --------- The remaining fields are for internal use only. --------- */
285 /* --------- But! they must be initialized to 0. --------- */
286
287 /* NODEC is the number of nodes in the NFA with non-epsilon
288 * transitions.
289 */
290 int nodec;
291
292 /* EPSNODEC is the number of nodes with only epsilon transitions. */
293 int epsnodec;
294
295 /* The sum (NODEC + EPSNODEC) is the total number of states in the
296 * compiled NFA.
297 */
298
299 /* Lists of side effects as stored in the NFA are `hash consed'..meaning
300 * that lists with the same elements are ==. During compilation,
301 * this table facilitates hash-consing.
302 */
303 struct rx_hash se_list_memo;
304
305 /* Lists of NFA states are also hashed.
306 */
307 struct rx_hash set_list_memo;
308
309
310
311
312 /* The compiler and matcher must build a number of instruction frames.
313 * The format of these frames is fixed (c.f. struct rx_inx). The values
314 * of the instructions is not fixed.
315 *
316 * An enumerated type (enum rx_opcode) defines the set of instructions
317 * that the compiler or matcher might generate. When filling an instruction
318 * frame, the INX field is found by indexing this instruction table
319 * with an opcode:
320 */
321 void ** instruction_table;
322
323 /* The list of all states in an NFA.
324 * During compilation, the NEXT field of NFA states links this list.
325 * After compilation, all the states are compacted into an array,
326 * ordered by state id numbers. At that time, this points to the base
327 * of that array.
328 */
329 struct rx_nfa_state *nfa_states;
330
331 /* Every nfa begins with one distinguished starting state:
332 */
333 struct rx_nfa_state *start;
334
335 /* This orders the search through super-nfa paths.
336 * See the comment near the typedef of rx_se_list_order.
337 */
338 rx_se_list_order se_list_cmp;
339
340 struct rx_superset * start_set;
341};
342
343
344
345
346
347/* SYNTAX TREES */
348
349/* Compilation is in stages.
350 *
351 * In the first stage, a pattern specified by a string is
352 * translated into a syntax tree. Later stages will convert
353 * the syntax tree into an NFA optimized for conversion to a
354 * superstate-NFA.
355 *
356 * This page is about syntax trees.
357 */
358
359enum rexp_node_type
360{
361 r_cset, /* Match from a character set. `a' or `[a-z]'*/
362 r_concat, /* Concat two subexpressions. `ab' */
363 r_alternate, /* Choose one of two subexpressions. `a\|b' */
364 r_opt, /* Optional subexpression. `a?' */
365 r_star, /* Repeated subexpression. `a*' */
366
367
368 /* A 2phase-star is a variation on a repeated subexpression.
369 * In this case, there are two subexpressions. The first, if matched,
370 * begins a repitition (otherwise, the whole expression is matches the
371 * empth string).
372 *
373 * After matching the first subexpression, a 2phase star either finishes,
374 * or matches the second subexpression. If the second subexpression is
375 * matched, then the whole construct repeats.
376 *
377 * 2phase stars are used in two circumstances. First, they
378 * are used as part of the implementation of POSIX intervals (counted
379 * repititions). Second, they are used to implement proper star
380 * semantics when the repeated subexpression contains paths of
381 * only side effects. See rx_compile for more information.
382 */
383 r_2phase_star,
384
385
386 /* c.f. "typedef void * rx_side_effect" */
387 r_side_effect,
388
389 /* This is an extension type: It is for transient use in source->source
390 * transformations (implemented over syntax trees).
391 */
392 r_data
393};
394
395/* A side effect is a matcher-specific action associated with
396 * transitions in the NFA. The details of side effects are up
397 * to the matcher. To the compiler and superstate constructors
398 * side effects are opaque:
399 */
400
401typedef void * rx_side_effect;
402
403/* Nodes in a syntax tree are of this type:
404 */
405struct rexp_node
406{
407 enum rexp_node_type type;
408 union
409 {
410 rx_Bitset cset;
411 rx_side_effect side_effect;
412 struct
413 {
414 struct rexp_node *left;
415 struct rexp_node *right;
416 } pair;
417 void * data;
418 } params;
419};
420
421
422
423
424/* NFA
425 *
426 * A syntax tree is compiled into an NFA. This page defines the structure
427 * of that NFA.
428 */
429
430struct rx_nfa_state
431{
432 /* These are kept in a list as the NFA is being built. */
433 struct rx_nfa_state *next;
434
435 /* After the NFA is built, states are given integer id's.
436 * States whose outgoing transitions are all either epsilon or
437 * side effect edges are given ids less than 0. Other states
438 * are given successive non-negative ids starting from 0.
439 */
440 int id;
441
442 /* The list of NFA edges that go from this state to some other. */
443 struct rx_nfa_edge *edges;
444
445 /* If you land in this state, then you implicitly land
446 * in all other states reachable by only epsilon translations.
447 * Call the set of maximal paths to such states the epsilon closure
448 * of this state.
449 *
450 * There may be other states that are reachable by a mixture of
451 * epsilon and side effect edges. Consider the set of maximal paths
452 * of that sort from this state. Call it the epsilon-side-effect
453 * closure of the state.
454 *
455 * The epsilon closure of the state is a subset of the epsilon-side-
456 * effect closure. It consists of all the paths that contain
457 * no side effects -- only epsilon edges.
458 *
459 * The paths in the epsilon-side-effect closure can be partitioned
460 * into equivalance sets. Two paths are equivalant if they have the
461 * same set of side effects, in the same order. The epsilon-closure
462 * is one of these equivalance sets. Let's call these equivalance
463 * sets: observably equivalant path sets. That name is chosen
464 * because equivalance of two paths means they cause the same side
465 * effects -- so they lead to the same subsequent observations other
466 * than that they may wind up in different target states.
467 *
468 * The superstate nfa, which is derived from this nfa, is based on
469 * the observation that all of the paths in an observably equivalant
470 * path set can be explored at the same time, provided that the
471 * matcher keeps track not of a single nfa state, but of a set of
472 * states. In particular, after following all the paths in an
473 * observably equivalant set, you wind up at a set of target states.
474 * That set of target states corresponds to one state in the
475 * superstate NFA.
476 *
477 * Staticly, before matching begins, it is convenient to analyze the
478 * nfa. Each state is labeled with a list of the observably
479 * equivalant path sets who's union covers all the
480 * epsilon-side-effect paths beginning in this state. This list is
481 * called the possible futures of the state.
482 *
483 * A trivial example is this NFA:
484 * s1
485 * A ---> B
486 *
487 * s2
488 * ---> C
489 *
490 * epsilon s1
491 * ---------> D ------> E
492 *
493 *
494 * In this example, A has two possible futures.
495 * One invokes the side effect `s1' and contains two paths,
496 * one ending in state B, the other in state E.
497 * The other invokes the side effect `s2' and contains only
498 * one path, landing in state C.
499 */
500 struct rx_possible_future *futures;
501
502
503 /* There are exactly two distinguished states in every NFA: */
504 unsigned int is_final:1;
505 unsigned int is_start:1;
506
507 /* These are used during NFA construction... */
508 unsigned int eclosure_needed:1;
509 unsigned int mark:1;
510};
511
512
513/* An edge in an NFA is typed: */
514enum rx_nfa_etype
515{
516 /* A cset edge is labled with a set of characters one of which
517 * must be matched for the edge to be taken.
518 */
519 ne_cset,
520
521 /* An epsilon edge is taken whenever its starting state is
522 * reached.
523 */
524 ne_epsilon,
525
526 /* A side effect edge is taken whenever its starting state is
527 * reached. Side effects may cause the match to fail or the
528 * position of the matcher to advance.
529 */
530 ne_side_effect /* A special kind of epsilon. */
531};
532
533struct rx_nfa_edge
534{
535 struct rx_nfa_edge *next;
536 enum rx_nfa_etype type;
537 struct rx_nfa_state *dest;
538 union
539 {
540 rx_Bitset cset;
541 rx_side_effect side_effect;
542 } params;
543};
544
545
546
547/* A possible future consists of a list of side effects
548 * and a set of destination states. Below are their
549 * representations. These structures are hash-consed which
550 * means that lists with the same elements share a representation
551 * (their addresses are ==).
552 */
553
554struct rx_nfa_state_set
555{
556 struct rx_nfa_state * car;
557 struct rx_nfa_state_set * cdr;
558};
559
560struct rx_se_list
561{
562 rx_side_effect car;
563 struct rx_se_list * cdr;
564};
565
566struct rx_possible_future
567{
568 struct rx_possible_future *next;
569 struct rx_se_list * effects;
570 struct rx_nfa_state_set * destset;
571};
572
573
574
575
576/* This begins the description of the superstate NFA.
577 *
578 * The superstate NFA corresponds to the NFA in these ways:
579 *
580 * Every superstate NFA states SUPER correspond to sets of NFA states,
581 * nfa_states(SUPER).
582 *
583 * Superstate edges correspond to NFA paths.
584 *
585 * The superstate has no epsilon transitions;
586 * every edge has a character label, and a (possibly empty) side
587 * effect label. The side effect label corresponds to a list of
588 * side effects that occur in the NFA. These parts are referred
589 * to as: superedge_character(EDGE) and superedge_sides(EDGE).
590 *
591 * For a superstate edge EDGE starting in some superstate SUPER,
592 * the following is true (in pseudo-notation :-):
593 *
594 * exists DEST in nfa_states s.t.
595 * exists nfaEDGE in nfa_edges s.t.
596 * origin (nfaEDGE) == DEST
597 * && origin (nfaEDGE) is a member of nfa_states(SUPER)
598 * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
599 * sides_of_possible_future (PF) == superedge_sides (EDGE)
600 *
601 * also:
602 *
603 * let SUPER2 := superedge_destination(EDGE)
604 * nfa_states(SUPER2)
605 * == union of all nfa state sets S s.t.
606 * exists PF in possible_futures(dest(nfaEDGE)) s.t.
607 * sides_of_possible_future (PF) == superedge_sides (EDGE)
608 * && S == dests_of_possible_future (PF) }
609 *
610 * Or in english, every superstate is a set of nfa states. A given
611 * character and a superstate implies many transitions in the NFA --
612 * those that begin with an edge labeled with that character from a
613 * state in the set corresponding to the superstate.
614 *
615 * The destinations of those transitions each have a set of possible
616 * futures. A possible future is a list of side effects and a set of
617 * destination NFA states. Two sets of possible futures can be
618 * `merged' by combining all pairs of possible futures that have the
619 * same side effects. A pair is combined by creating a new future
620 * with the same side effect but the union of the two destination sets.
621 * In this way, all the possible futures suggested by a superstate
622 * and a character can be merged into a set of possible futures where
623 * no two elements of the set have the same set of side effects.
624 *
625 * The destination of a possible future, being a set of NFA states,
626 * corresponds to a supernfa state. So, the merged set of possible
627 * futures we just created can serve as a set of edges in the
628 * supernfa.
629 *
630 * The representation of the superstate nfa and the nfa is critical.
631 * The nfa has to be compact, but has to facilitate the rapid
632 * computation of missing superstates. The superstate nfa has to
633 * be fast to interpret, lazilly constructed, and bounded in space.
634 *
635 * To facilitate interpretation, the superstate data structures are
636 * peppered with `instruction frames'. There is an instruction set
637 * defined below which matchers using the supernfa must be able to
638 * interpret.
639 *
640 * We'd like to make it possible but not mandatory to use code
641 * addresses to represent instructions (c.f. gcc's computed goto).
642 * Therefore, we define an enumerated type of opcodes, and when
643 * writing one of these instructions into a data structure, use
644 * the opcode as an index into a table of instruction values.
645 *
646 * Here are the opcodes that occur in the superstate nfa:
647 */
648
649
650/* Every superstate contains a table of instruction frames indexed
651 * by characters. A normal `move' in a matcher is to fetch the next
652 * character and use it as an index into a superstates transition
653 * table.
654 *
655 * In the fasted case, only one edge follows from that character.
656 * In other cases there is more work to do.
657 *
658 * The descriptions of the opcodes refer to data structures that are
659 * described further below.
660 */
661
662enum rx_opcode
663{
664 /*
665 * BACKTRACK_POINT is invoked when a character transition in
666 * a superstate leads to more than one edge. In that case,
667 * the edges have to be explored independently using a backtracking
668 * strategy.
669 *
670 * A BACKTRACK_POINT instruction is stored in a superstate's
671 * transition table for some character when it is known that that
672 * character crosses more than one edge. On encountering this
673 * instruction, the matcher saves enough state to backtrack to this
674 * point in the match later.
675 */
676 rx_backtrack_point = 0, /* data is (struct transition_class *) */
677
678 /*
679 * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
680 * There is one occurence of this instruction per rx_distinct_future.
681 * This instruction is skipped if a rx_distinct_future has no side effects.
682 */
683 rx_do_side_effects = rx_backtrack_point + 1,
684
685 /* data is (struct rx_distinct_future *) */
686
687 /*
688 * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
689 * destination superstate has been reclaimed (or was never built).
690 * It recomputes the destination superstate.
691 * RX_CACHE_MISS is also stored in a superstate transition table before
692 * any of its edges have been built.
693 */
694 rx_cache_miss = rx_do_side_effects + 1,
695 /* data is (struct rx_distinct_future *) */
696
697 /*
698 * RX_NEXT_CHAR is called to consume the next character and take the
699 * corresponding transition. This is the only instruction that uses
700 * the DATA field of the instruction frame instead of DATA_2.
701 * (see EXPLORE_FUTURE in regex.c).
702 */
703 rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
704
705 /* RX_BACKTRACK indicates that a transition fails.
706 */
707 rx_backtrack = rx_next_char + 1, /* no data */
708
709 /*
710 * RX_ERROR_INX is stored only in places that should never be executed.
711 */
712 rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
713
714 rx_num_instructions = rx_error_inx + 1
715};
716
717/* An id_instruction_table holds the values stored in instruction
718 * frames. The table is indexed by the enums declared above.
719 */
720extern void * rx_id_instruction_table[rx_num_instructions];
721
722/* The heart of the matcher is a `word-code-interpreter'
723 * (like a byte-code interpreter, except that instructions
724 * are a full word wide).
725 *
726 * Instructions are not stored in a vector of code, instead,
727 * they are scattered throughout the data structures built
728 * by the regexp compiler and the matcher. One word-code instruction,
729 * together with the arguments to that instruction, constitute
730 * an instruction frame (struct rx_inx).
731 *
732 * This structure type is padded by hand to a power of 2 because
733 * in one of the dominant cases, we dispatch by indexing a table
734 * of instruction frames. If that indexing can be accomplished
735 * by just a shift of the index, we're happy.
736 *
737 * Instructions take at most one argument, but there are two
738 * slots in an instruction frame that might hold that argument.
739 * These are called data and data_2. The data slot is only
740 * used for one instruction (RX_NEXT_CHAR). For all other
741 * instructions, data should be set to 0.
742 *
743 * RX_NEXT_CHAR is the most important instruction by far.
744 * By reserving the data field for its exclusive use,
745 * instruction dispatch is sped up in that case. There is
746 * no need to fetch both the instruction and the data,
747 * only the data is needed. In other words, a `cycle' begins
748 * by fetching the field data. If that is non-0, then it must
749 * be the destination state of a next_char transition, so
750 * make that value the current state, advance the match position
751 * by one character, and start a new cycle. On the other hand,
752 * if data is 0, fetch the instruction and do a more complicated
753 * dispatch on that.
754 */
755
756struct rx_inx
757{
758 void * data;
759 void * data_2;
760 void * inx;
761 void * fnord;
762};
763
764#ifndef RX_TAIL_ARRAY
765#define RX_TAIL_ARRAY 1
766#endif
767
768/* A superstate corresponds to a set of nfa states. Those sets are
769 * represented by STRUCT RX_SUPERSET. The constructors
770 * guarantee that only one (shared) structure is created for a given set.
771 */
772struct rx_superset
773{
774 int refs; /* This is a reference counted structure. */
775
776 /* We keep these sets in a cache because (in an unpredictable way),
777 * the same set is often created again and again. But that is also
778 * problematic -- compatibility with POSIX and GNU regex requires
779 * that we not be able to tell when a program discards a particular
780 * NFA (thus invalidating the supersets created from it).
781 *
782 * But when a cache hit appears to occur, we will have in hand the
783 * nfa for which it may have happened. That is why every nfa is given
784 * its own sequence number. On a cache hit, the cache is validated
785 * by comparing the nfa sequence number to this field:
786 */
787 int id;
788
789 struct rx_nfa_state * car; /* May or may not be a valid addr. */
790 struct rx_superset * cdr;
791
792 /* If the corresponding superstate exists: */
793 struct rx_superstate * superstate;
794
795
796 /* There is another bookkeeping problem. It is expensive to
797 * compute the starting nfa state set for an nfa. So, once computed,
798 * it is cached in the `struct rx'.
799 *
800 * But, the state set can be flushed from the superstate cache.
801 * When that happens, we can't know if the corresponding `struct rx'
802 * is still alive or if it has been freed or re-used by the program.
803 * So, the cached pointer to this set in a struct rx might be invalid
804 * and we need a way to validate it.
805 *
806 * Fortunately, even if this set is flushed from the cache, it is
807 * not freed. It just goes on the free-list of supersets.
808 * So we can still examine it.
809 *
810 * So to validate a starting set memo, check to see if the
811 * starts_for field still points back to the struct rx in question,
812 * and if the ID matches the rx sequence number.
813 */
814 struct rx * starts_for;
815
816 /* This is used to link into a hash bucket so these objects can
817 * be `hash-consed'.
818 */
819 struct rx_hash_item hash_item;
820};
821
822#define rx_protect_superset(RX,CON) (++(CON)->refs)
823
824/* The terminology may be confusing (rename this structure?).
825 * Every character occurs in at most one rx_super_edge per super-state.
826 * But, that structure might have more than one option, indicating a point
827 * of non-determinism.
828 *
829 * In other words, this structure holds a list of superstate edges
830 * sharing a common starting state and character label. The edges
831 * are in the field OPTIONS. All superstate edges sharing the same
832 * starting state and character are in this list.
833 */
834struct rx_super_edge
835{
836 struct rx_super_edge *next;
837 struct rx_inx rx_backtrack_frame;
838 int cset_size;
839 rx_Bitset cset;
840 struct rx_distinct_future *options;
841};
842
843/* A superstate is a set of nfa states (RX_SUPERSET) along
844 * with a transition table. Superstates are built on demand and reclaimed
845 * without warning. To protect a superstate from this ghastly fate,
846 * use LOCK_SUPERSTATE.
847 */
848struct rx_superstate
849{
850 int rx_id; /* c.f. the id field of rx_superset */
851 int locks; /* protection from reclamation */
852
853 /* Within a superstate cache, all the superstates are kept in a big
854 * queue. The tail of the queue is the state most likely to be
855 * reclaimed. The *recyclable fields hold the queue position of
856 * this state.
857 */
858 struct rx_superstate * next_recyclable;
859 struct rx_superstate * prev_recyclable;
860
861 /* The supernfa edges that exist in the cache and that have
862 * this state as their destination are kept in this list:
863 */
864 struct rx_distinct_future * transition_refs;
865
866 /* The list of nfa states corresponding to this superstate: */
867 struct rx_superset * contents;
868
869 /* The list of edges in the cache beginning from this state. */
870 struct rx_super_edge * edges;
871
872 /* A tail of the recyclable queue is marked as semifree. A semifree
873 * state has no incoming next_char transitions -- any transition
874 * into a semifree state causes a complex dispatch with the side
875 * effect of rescuing the state from its semifree state.
876 *
877 * An alternative to this might be to make next_char more expensive,
878 * and to move a state to the head of the recyclable queue whenever
879 * it is entered. That way, popular states would never be recycled.
880 *
881 * But unilaterally making next_char more expensive actually loses.
882 * So, incoming transitions are only made expensive for states near
883 * the tail of the recyclable queue. The more cache contention
884 * there is, the more frequently a state will have to prove itself
885 * and be moved back to the front of the queue. If there is less
886 * contention, then popular states just aggregate in the front of
887 * the queue and stay there.
888 */
889 int is_semifree;
890
891
892 /* This keeps track of the size of the transition table for this
893 * state. There is a half-hearted attempt to support variable sized
894 * superstates.
895 */
896 int trans_size;
897
898 /* Indexed by characters... */
899 struct rx_inx transitions[RX_TAIL_ARRAY];
900};
901
902
903/* A list of distinct futures define the edges that leave from a
904 * given superstate on a given character. c.f. rx_super_edge.
905 */
906
907struct rx_distinct_future
908{
909 struct rx_distinct_future * next_same_super_edge[2];
910 struct rx_distinct_future * next_same_dest;
911 struct rx_distinct_future * prev_same_dest;
912 struct rx_superstate * present; /* source state */
913 struct rx_superstate * future; /* destination state */
914 struct rx_super_edge * edge;
915
916
917 /* The future_frame holds the instruction that should be executed
918 * after all the side effects are done, when it is time to complete
919 * the transition to the next state.
920 *
921 * Normally this is a next_char instruction, but it may be a
922 * cache_miss instruction as well, depending on whether or not
923 * the superstate is in the cache and semifree.
924 *
925 * If this is the only future for a given superstate/char, and
926 * if there are no side effects to be performed, this frame is
927 * not used (directly) at all. Instead, its contents are copied
928 * into the transition table of the starting state of this dist. future.
929 */
930 struct rx_inx future_frame;
931
932 struct rx_inx side_effects_frame;
933 struct rx_se_list * effects;
934};
935
936#define rx_lock_superstate(R,S) ((S)->locks++)
937#define rx_unlock_superstate(R,S) (--(S)->locks)
938
939
940
941/* This page destined for rx.h */
942
943struct rx_blocklist
944{
945 struct rx_blocklist * next;
946 int bytes;
947};
948
949struct rx_freelist
950{
951 struct rx_freelist * next;
952};
953
954struct rx_cache;
955
956#ifdef __STDC__
957typedef void (*rx_morecore_fn)(struct rx_cache *);
958#else
959typedef void (*rx_morecore_fn)();
960#endif
961
962/* You use this to control the allocation of superstate data
963 * during matching. Most of it should be initialized to 0.
964 *
965 * A MORECORE function is necessary. It should allocate
966 * a new block of memory or return 0.
967 * A default that uses malloc is called `rx_morecore'.
968 *
969 * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
970 * the system will try to allocate. The default is 128. Batch style
971 * applications that are very regexp intensive should use as high a number
972 * as possible without thrashing.
973 *
974 * The LOCAL_CSET_SIZE is the number of characters in a character set.
975 * It is therefore the number of entries in a superstate transition table.
976 * Generally, it should be 256. If your character set has 16 bits,
977 * it is better to translate your regexps into equivalent 8 bit patterns.
978 */
979
980struct rx_cache
981{
982 struct rx_hash_rules superset_hash_rules;
983
984 /* Objects are allocated by incrementing a pointer that
985 * scans across rx_blocklists.
986 */
987 struct rx_blocklist * memory;
988 struct rx_blocklist * memory_pos;
989 int bytes_left;
990 char * memory_addr;
991 rx_morecore_fn morecore;
992
993 /* Freelists. */
994 struct rx_freelist * free_superstates;
995 struct rx_freelist * free_transition_classes;
996 struct rx_freelist * free_discernable_futures;
997 struct rx_freelist * free_supersets;
998 struct rx_freelist * free_hash;
999
1000 /* Two sets of superstates -- those that are semifreed, and those
1001 * that are being used.
1002 */
1003 struct rx_superstate * lru_superstate;
1004 struct rx_superstate * semifree_superstate;
1005
1006 struct rx_superset * empty_superset;
1007
1008 int superstates;
1009 int semifree_superstates;
1010 int hits;
1011 int misses;
1012 int superstates_allowed;
1013
1014 int local_cset_size;
1015 void ** instruction_table;
1016
1017 struct rx_hash superset_table;
1018};
1019
1020
1021
1022
1023/* The lowest-level search function supports arbitrarily fragmented
1024 * strings and (optionally) suspendable/resumable searches.
1025 *
1026 * Callers have to provide a few hooks.
1027 */
1028
1029#ifndef __GNUC__
1030#ifdef __STDC__
1031#define __const__ const
1032#else
1033#define __const__
1034#endif
1035#endif
1036
1037/* This holds a matcher position */
1038struct rx_string_position
1039{
1040 __const__ unsigned char * pos; /* The current pos. */
1041 __const__ unsigned char * string; /* The current string burst. */
1042 __const__ unsigned char * end; /* First invalid position >= POS. */
1043 int offset; /* Integer address of the current burst. */
1044 int size; /* Current string's size. */
1045 int search_direction; /* 1 or -1 */
1046 int search_end; /* First position to not try. */
1047};
1048
1049
1050enum rx_get_burst_return
1051{
1052 rx_get_burst_continuation,
1053 rx_get_burst_error,
1054 rx_get_burst_ok,
1055 rx_get_burst_no_more
1056};
1057
1058
1059/* A call to get burst should make POS valid. It might be invalid
1060 * if the STRING field doesn't point to a burst that actually
1061 * contains POS.
1062 *
1063 * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to
1064 * whether or not to pad to the left. Padding to the right is always
1065 * appropriate, but need not go past the point indicated by STOP.
1066 *
1067 * If a continuation is returned, then the reentering call to
1068 * a search function will retry the get_burst.
1069 */
1070
1071#ifdef __STDC__
1072typedef enum rx_get_burst_return
1073 (*rx_get_burst_fn) (struct rx_string_position * pos,
1074 void * app_closure,
1075 int stop);
1076
1077#else
1078typedef enum rx_get_burst_return (*rx_get_burst_fn) ();
1079#endif
1080
1081
1082enum rx_back_check_return
1083{
1084 rx_back_check_continuation,
1085 rx_back_check_error,
1086 rx_back_check_pass,
1087 rx_back_check_fail
1088};
1089
1090/* Back_check should advance the position it is passed
1091 * over rparen - lparen characters and return pass iff
1092 * the characters starting at POS match those indexed
1093 * by [LPAREN..RPAREN].
1094 *
1095 * If a continuation is returned, then the reentering call to
1096 * a search function will retry the back_check.
1097 */
1098
1099#ifdef __STDC__
1100typedef enum rx_back_check_return
1101 (*rx_back_check_fn) (struct rx_string_position * pos,
1102 int lparen,
1103 int rparen,
1104 unsigned char * translate,
1105 void * app_closure,
1106 int stop);
1107
1108#else
1109typedef enum rx_back_check_return (*rx_back_check_fn) ();
1110#endif
1111
1112
1113
1114
1115/* A call to fetch_char should return the character at POS or POS + 1.
1116 * Returning continuations here isn't supported. OFFSET is either 0 or 1
1117 * and indicates which characters is desired.
1118 */
1119
1120#ifdef __STDC__
1121typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos,
1122 int offset,
1123 void * app_closure,
1124 int stop);
1125#else
1126typedef int (*rx_fetch_char_fn) ();
1127#endif
1128
1129
1130enum rx_search_return
1131{
1132 rx_search_continuation = -4,
1133 rx_search_error = -3,
1134 rx_search_soft_fail = -2, /* failed by running out of string */
1135 rx_search_fail = -1 /* failed only by reaching failure states */
1136 /* return values >= 0 indicate the position of a successful match */
1137};
1138
1139
1140
1141
1142
1143
1144
1145/* regex.h
1146 *
1147 * The remaining declarations replace regex.h.
1148 */
1149
1150/* This is an array of error messages corresponding to the error codes.
1151 */
1152extern __const__ char *re_error_msg[];
1153
1154/* If any error codes are removed, changed, or added, update the
1155 `re_error_msg' table in regex.c. */
1156typedef enum
1157{
1158 REG_NOERROR = 0, /* Success. */
1159 REG_NOMATCH, /* Didn't find a match (for regexec). */
1160
1161 /* POSIX regcomp return error codes. (In the order listed in the
1162 standard.) */
1163 REG_BADPAT, /* Invalid pattern. */
1164 REG_ECOLLATE, /* Not implemented. */
1165 REG_ECTYPE, /* Invalid character class name. */
1166 REG_EESCAPE, /* Trailing backslash. */
1167 REG_ESUBREG, /* Invalid back reference. */
1168 REG_EBRACK, /* Unmatched left bracket. */
1169 REG_EPAREN, /* Parenthesis imbalance. */
1170 REG_EBRACE, /* Unmatched \{. */
1171 REG_BADBR, /* Invalid contents of \{\}. */
1172 REG_ERANGE, /* Invalid range end. */
1173 REG_ESPACE, /* Ran out of memory. */
1174 REG_BADRPT, /* No preceding re for repetition op. */
1175
1176 /* Error codes we've added. */
1177 REG_EEND, /* Premature end. */
1178 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
1179 REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */
1180} reg_errcode_t;
1181
1182/* The regex.c support, as a client of rx, defines a set of possible
1183 * side effects that can be added to the edge lables of nfa edges.
1184 * Here is the list of sidef effects in use.
1185 */
1186
1187enum re_side_effects
1188{
1189#define RX_WANT_SE_DEFS 1
1190#undef RX_DEF_SE
1191#undef RX_DEF_CPLX_SE
1192#define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE,
1193#define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE,
1194#include "rx.h"
1195#undef RX_DEF_SE
1196#undef RX_DEF_CPLX_SE
1197#undef RX_WANT_SE_DEFS
1198 re_floogle_flap = 65533
1199};
1200
1201/* These hold paramaters for the kinds of side effects that are possible
1202 * in the supported pattern languages. These include things like the
1203 * numeric bounds of {} operators and the index of paren registers for
1204 * subexpression measurement or backreferencing.
1205 */
1206struct re_se_params
1207{
1208 enum re_side_effects se;
1209 int op1;
1210 int op2;
1211};
1212
1213typedef unsigned reg_syntax_t;
1214
1215struct re_pattern_buffer
1216{
1217 struct rx rx;
1218 reg_syntax_t syntax; /* See below for syntax bit definitions. */
1219
1220 unsigned int no_sub:1; /* If set, don't return register offsets. */
1221 unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */
1222 unsigned int not_eol:1; /* match at the ends of the string. */
1223 unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/
1224 unsigned int least_subs:1; /* If set, and returning registers, return
1225 * as few values as possible. Only
1226 * backreferenced groups and group 0 (the whole
1227 * match) will be returned.
1228 */
1229
1230 /* If true, this says that the matcher should keep registers on its
1231 * backtracking stack. For many patterns, we can easily determine that
1232 * this isn't necessary.
1233 */
1234 unsigned int match_regs_on_stack:1;
1235 unsigned int search_regs_on_stack:1;
1236
1237 /* is_anchored and begbuf_only are filled in by rx_compile. */
1238 unsigned int is_anchored:1; /* Anchorded by ^? */
1239 unsigned int begbuf_only:1; /* Anchored to char position 0? */
1240
1241
1242 /* If REGS_UNALLOCATED, allocate space in the `regs' structure
1243 * for `max (RE_NREGS, re_nsub + 1)' groups.
1244 * If REGS_REALLOCATE, reallocate space if necessary.
1245 * If REGS_FIXED, use what's there.
1246 */
1247#define REGS_UNALLOCATED 0
1248#define REGS_REALLOCATE 1
1249#define REGS_FIXED 2
1250 unsigned int regs_allocated:2;
1251
1252
1253 /* Either a translate table to apply to all characters before
1254 * comparing them, or zero for no translation. The translation
1255 * is applied to a pattern when it is compiled and to a string
1256 * when it is matched.
1257 */
1258 unsigned char * translate;
1259
1260 /* If this is a valid pointer, it tells rx not to store the extents of
1261 * certain subexpressions (those corresponding to non-zero entries).
1262 * Passing 0x1 is the same as passing an array of all ones. Passing 0x0
1263 * is the same as passing an array of all zeros.
1264 * The array should contain as many entries as their are subexps in the
1265 * regexp.
1266 *
1267 * For POSIX compatability, when using regcomp and regexec this field
1268 * is zeroed and ignored.
1269 */
1270 char * syntax_parens;
1271
1272 /* Number of subexpressions found by the compiler. */
1273 size_t re_nsub;
1274
1275 void * buffer; /* Malloced memory for the nfa. */
1276 unsigned long allocated; /* Size of that memory. */
1277
1278 /* Pointer to a fastmap, if any, otherwise zero. re_search uses
1279 * the fastmap, if there is one, to skip over impossible
1280 * starting points for matches. */
1281 char *fastmap;
1282
1283 unsigned int fastmap_accurate:1; /* These three are internal. */
1284 unsigned int can_match_empty:1;
1285 struct rx_nfa_state * start; /* The nfa starting state. */
1286
1287 /* This is the list of iterator bounds for {lo,hi} constructs.
1288 * The memory pointed to is part of the rx->buffer.
1289 */
1290 struct re_se_params *se_params;
1291
1292 /* This is a bitset representation of the fastmap.
1293 * This is a true fastmap that already takes the translate
1294 * table into account.
1295 */
1296 rx_Bitset fastset;
1297};
1298
1299/* Type for byte offsets within the string. POSIX mandates this. */
1300typedef int regoff_t;
1301
1302/* This is the structure we store register match data in. See
1303 regex.texinfo for a full description of what registers match. */
1304struct re_registers
1305{
1306 unsigned num_regs;
1307 regoff_t *start;
1308 regoff_t *end;
1309};
1310
1311typedef struct re_pattern_buffer regex_t;
1312
1313/* POSIX specification for registers. Aside from the different names than
1314 `re_registers', POSIX uses an array of structures, instead of a
1315 structure of arrays. */
1316typedef struct
1317{
1318 regoff_t rm_so; /* Byte offset from string's start to substring's start. */
1319 regoff_t rm_eo; /* Byte offset from string's start to substring's end. */
1320} regmatch_t;
1321
1322
1323
1324/* The following bits are used to determine the regexp syntax we
1325 recognize. The set/not-set meanings are chosen so that Emacs syntax
1326 remains the value 0. The bits are given in alphabetical order, and
1327 the definitions shifted by one from the previous bit; thus, when we
1328 add or remove a bit, only one other definition need change. */
1329
1330/* If this bit is not set, then \ inside a bracket expression is literal.
1331 If set, then such a \ quotes the following character. */
1332#define RE_BACKSLASH_ESCAPE_IN_LISTS (1)
1333
1334/* If this bit is not set, then + and ? are operators, and \+ and \? are
1335 literals.
1336 If set, then \+ and \? are operators and + and ? are literals. */
1337#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
1338
1339/* If this bit is set, then character classes are supported. They are:
1340 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
1341 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
1342 If not set, then character classes are not supported. */
1343#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
1344
1345/* If this bit is set, then ^ and $ are always anchors (outside bracket
1346 expressions, of course).
1347 If this bit is not set, then it depends:
1348 ^ is an anchor if it is at the beginning of a regular
1349 expression or after an open-group or an alternation operator;
1350 $ is an anchor if it is at the end of a regular expression, or
1351 before a close-group or an alternation operator.
1352
1353 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
1354 POSIX draft 11.2 says that * etc. in leading positions is undefined.
1355 We already implemented a previous draft which made those constructs
1356 invalid, though, so we haven't changed the code back. */
1357#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
1358
1359/* If this bit is set, then special characters are always special
1360 regardless of where they are in the pattern.
1361 If this bit is not set, then special characters are special only in
1362 some contexts; otherwise they are ordinary. Specifically,
1363 * + ? and intervals are only special when not after the beginning,
1364 open-group, or alternation operator. */
1365#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
1366
1367/* If this bit is set, then *, +, ?, and { cannot be first in an re or
1368 immediately after an alternation or begin-group operator. */
1369#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
1370
1371/* If this bit is set, then . matches newline.
1372 If not set, then it doesn't. */
1373#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
1374
1375/* If this bit is set, then . doesn't match NUL.
1376 If not set, then it does. */
1377#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
1378
1379/* If this bit is set, nonmatching lists [^...] do not match newline.
1380 If not set, they do. */
1381#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
1382
1383/* If this bit is set, either \{...\} or {...} defines an
1384 interval, depending on RE_NO_BK_BRACES.
1385 If not set, \{, \}, {, and } are literals. */
1386#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
1387
1388/* If this bit is set, +, ? and | aren't recognized as operators.
1389 If not set, they are. */
1390#define RE_LIMITED_OPS (RE_INTERVALS << 1)
1391
1392/* If this bit is set, newline is an alternation operator.
1393 If not set, newline is literal. */
1394#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
1395
1396/* If this bit is set, then `{...}' defines an interval, and \{ and \}
1397 are literals.
1398 If not set, then `\{...\}' defines an interval. */
1399#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
1400
1401/* If this bit is set, (...) defines a group, and \( and \) are literals.
1402 If not set, \(...\) defines a group, and ( and ) are literals. */
1403#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
1404
1405/* If this bit is set, then \<digit> matches <digit>.
1406 If not set, then \<digit> is a back-reference. */
1407#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
1408
1409/* If this bit is set, then | is an alternation operator, and \| is literal.
1410 If not set, then \| is an alternation operator, and | is literal. */
1411#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
1412
1413/* If this bit is set, then an ending range point collating higher
1414 than the starting range point, as in [z-a], is invalid.
1415 If not set, then when ending range point collates higher than the
1416 starting range point, the range is ignored. */
1417#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
1418
1419/* If this bit is set, then an unmatched ) is ordinary.
1420 If not set, then an unmatched ) is invalid. */
1421#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
1422
1423/* This global variable defines the particular regexp syntax to use (for
1424 some interfaces). When a regexp is compiled, the syntax used is
1425 stored in the pattern buffer, so changing this does not affect
1426 already-compiled regexps. */
1427extern reg_syntax_t re_syntax_options;
1428
1429
1430/* Define combinations of the above bits for the standard possibilities.
1431 (The [[[ comments delimit what gets put into the Texinfo file, so
1432 don't delete them!) */
1433/* [[[begin syntaxes]]] */
1434#define RE_SYNTAX_EMACS 0
1435
1436#define RE_SYNTAX_AWK \
1437 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \
1438 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
1439 | RE_NO_BK_VAR | RE_NO_EMPTY_RANGES \
1440 | RE_UNMATCHED_RIGHT_PAREN_ORD)
1441
1442#define RE_SYNTAX_POSIX_AWK \
1443 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
1444
1445#define RE_SYNTAX_GREP \
1446 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \
1447 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \
1448 | RE_NEWLINE_ALT)
1449
1450#define RE_SYNTAX_EGREP \
1451 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \
1452 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \
1453 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \
1454 | RE_NO_BK_VBAR)
1455
1456#define RE_SYNTAX_POSIX_EGREP \
1457 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
1458
1459#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
1460
1461/* Syntax bits common to both basic and extended POSIX regex syntax. */
1462#define _RE_SYNTAX_POSIX_COMMON \
1463 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \
1464 | RE_INTERVALS | RE_NO_EMPTY_RANGES)
1465
1466#define RE_SYNTAX_POSIX_BASIC \
1467 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
1468
1469/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
1470 RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
1471 isn't minimal, since other operators, such as \`, aren't disabled. */
1472#define RE_SYNTAX_POSIX_MINIMAL_BASIC \
1473 (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
1474
1475#define RE_SYNTAX_POSIX_EXTENDED \
1476 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
1477 | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
1478 | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
1479 | RE_UNMATCHED_RIGHT_PAREN_ORD)
1480
1481/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
1482 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
1483#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \
1484 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
1485 | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \
1486 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
1487 | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD)
1488/* [[[end syntaxes]]] */
1489
1490/* Maximum number of duplicates an interval can allow. Some systems
1491 (erroneously) define this in other header files, but we want our
1492 value, so remove any previous define. */
1493#ifdef RE_DUP_MAX
1494#undef RE_DUP_MAX
1495#endif
1496#define RE_DUP_MAX ((1 << 15) - 1)
1497
1498
1499
1500/* POSIX `cflags' bits (i.e., information for `regcomp'). */
1501
1502/* If this bit is set, then use extended regular expression syntax.
1503 If not set, then use basic regular expression syntax. */
1504#define REG_EXTENDED 1
1505
1506/* If this bit is set, then ignore case when matching.
1507 If not set, then case is significant. */
1508#define REG_ICASE (REG_EXTENDED << 1)
1509
1510/* If this bit is set, then anchors do not match at newline
1511 characters in the string.
1512 If not set, then anchors do match at newlines. */
1513#define REG_NEWLINE (REG_ICASE << 1)
1514
1515/* If this bit is set, then report only success or fail in regexec.
1516 If not set, then returns differ between not matching and errors. */
1517#define REG_NOSUB (REG_NEWLINE << 1)
1518
1519
1520/* POSIX `eflags' bits (i.e., information for regexec). */
1521
1522/* If this bit is set, then the beginning-of-line operator doesn't match
1523 the beginning of the string (presumably because it's not the
1524 beginning of a line).
1525 If not set, then the beginning-of-line operator does match the
1526 beginning of the string. */
1527#define REG_NOTBOL 1
1528
1529/* Like REG_NOTBOL, except for the end-of-line. */
1530#define REG_NOTEOL (1 << 1)
1531
1532/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
1533 * `re_match_2' returns information about at least this many registers
1534 * the first time a `regs' structure is passed.
1535 *
1536 * Also, this is the greatest number of backreferenced subexpressions
1537 * allowed in a pattern being matched without caller-supplied registers.
1538 */
1539#ifndef RE_NREGS
1540#define RE_NREGS 30
1541#endif
1542
1543extern int rx_cache_bound;
1544extern char rx_version_string[];
1545
1546
1547
1548
1549#ifdef RX_WANT_RX_DEFS
1550
1551/* This is decls to the interesting subsystems and lower layers
1552 * of rx. Everything which doesn't have a public counterpart in
1553 * regex.c is declared here.
1554 */
1555
1556
1557#ifdef __STDC__
1558typedef void (*rx_hash_freefn) (struct rx_hash_item * it);
1559#else /* ndef __STDC__ */
1560typedef void (*rx_hash_freefn) ();
1561#endif /* ndef __STDC__ */
1562
1563
1564
1565
1566
1567#ifdef __STDC__
1568RX_DECL int rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b);
1569RX_DECL int rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b);
1570RX_DECL int rx_bitset_empty (int size, rx_Bitset set);
1571RX_DECL void rx_bitset_null (int size, rx_Bitset b);
1572RX_DECL void rx_bitset_universe (int size, rx_Bitset b);
1573RX_DECL void rx_bitset_complement (int size, rx_Bitset b);
1574RX_DECL void rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b);
1575RX_DECL void rx_bitset_union (int size, rx_Bitset a, rx_Bitset b);
1576RX_DECL void rx_bitset_intersection (int size,
1577 rx_Bitset a, rx_Bitset b);
1578RX_DECL void rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b);
1579RX_DECL void rx_bitset_revdifference (int size,
1580 rx_Bitset a, rx_Bitset b);
1581RX_DECL void rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b);
1582RX_DECL unsigned long rx_bitset_hash (int size, rx_Bitset b);
1583RX_DECL struct rx_hash_item * rx_hash_find (struct rx_hash * table,
1584 unsigned long hash,
1585 void * value,
1586 struct rx_hash_rules * rules);
1587RX_DECL struct rx_hash_item * rx_hash_store (struct rx_hash * table,
1588 unsigned long hash,
1589 void * value,
1590 struct rx_hash_rules * rules);
1591RX_DECL void rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules);
1592RX_DECL void rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
1593 struct rx_hash_rules * rules);
1594RX_DECL rx_Bitset rx_cset (struct rx *rx);
1595RX_DECL rx_Bitset rx_copy_cset (struct rx *rx, rx_Bitset a);
1596RX_DECL void rx_free_cset (struct rx * rx, rx_Bitset c);
1597RX_DECL struct rexp_node * rexp_node (struct rx *rx,
1598 enum rexp_node_type type);
1599RX_DECL struct rexp_node * rx_mk_r_cset (struct rx * rx,
1600 rx_Bitset b);
1601RX_DECL struct rexp_node * rx_mk_r_concat (struct rx * rx,
1602 struct rexp_node * a,
1603 struct rexp_node * b);
1604RX_DECL struct rexp_node * rx_mk_r_alternate (struct rx * rx,
1605 struct rexp_node * a,
1606 struct rexp_node * b);
1607RX_DECL struct rexp_node * rx_mk_r_opt (struct rx * rx,
1608 struct rexp_node * a);
1609RX_DECL struct rexp_node * rx_mk_r_star (struct rx * rx,
1610 struct rexp_node * a);
1611RX_DECL struct rexp_node * rx_mk_r_2phase_star (struct rx * rx,
1612 struct rexp_node * a,
1613 struct rexp_node * b);
1614RX_DECL struct rexp_node * rx_mk_r_side_effect (struct rx * rx,
1615 rx_side_effect a);
1616RX_DECL struct rexp_node * rx_mk_r_data (struct rx * rx,
1617 void * a);
1618RX_DECL void rx_free_rexp (struct rx * rx, struct rexp_node * node);
1619RX_DECL struct rexp_node * rx_copy_rexp (struct rx *rx,
1620 struct rexp_node *node);
1621RX_DECL struct rx_nfa_state * rx_nfa_state (struct rx *rx);
1622RX_DECL void rx_free_nfa_state (struct rx_nfa_state * n);
1623RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (struct rx * rx,
1624 int id);
1625RX_DECL struct rx_nfa_edge * rx_nfa_edge (struct rx *rx,
1626 enum rx_nfa_etype type,
1627 struct rx_nfa_state *start,
1628 struct rx_nfa_state *dest);
1629RX_DECL void rx_free_nfa_edge (struct rx_nfa_edge * e);
1630RX_DECL void rx_free_nfa (struct rx *rx);
1631RX_DECL int rx_build_nfa (struct rx *rx,
1632 struct rexp_node *rexp,
1633 struct rx_nfa_state **start,
1634 struct rx_nfa_state **end);
1635RX_DECL void rx_name_nfa_states (struct rx *rx);
1636RX_DECL int rx_eclose_nfa (struct rx *rx);
1637RX_DECL void rx_delete_epsilon_transitions (struct rx *rx);
1638RX_DECL int rx_compactify_nfa (struct rx *rx,
1639 void **mem, unsigned long *size);
1640RX_DECL void rx_release_superset (struct rx *rx,
1641 struct rx_superset *set);
1642RX_DECL struct rx_superset * rx_superset_cons (struct rx * rx,
1643 struct rx_nfa_state *car, struct rx_superset *cdr);
1644RX_DECL struct rx_superset * rx_superstate_eclosure_union
1645 (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl);
1646RX_DECL struct rx_superstate * rx_superstate (struct rx *rx,
1647 struct rx_superset *set);
1648RX_DECL struct rx_inx * rx_handle_cache_miss
1649 (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data);
1650RX_DECL reg_errcode_t rx_compile (__const__ char *pattern, int size,
1651 reg_syntax_t syntax,
1652 struct re_pattern_buffer * rxb);
1653RX_DECL void rx_blow_up_fastmap (struct re_pattern_buffer * rxb);
1654#else /* STDC */
1655RX_DECL int rx_bitset_is_equal ();
1656RX_DECL int rx_bitset_is_subset ();
1657RX_DECL int rx_bitset_empty ();
1658RX_DECL void rx_bitset_null ();
1659RX_DECL void rx_bitset_universe ();
1660RX_DECL void rx_bitset_complement ();
1661RX_DECL void rx_bitset_assign ();
1662RX_DECL void rx_bitset_union ();
1663RX_DECL void rx_bitset_intersection ();
1664RX_DECL void rx_bitset_difference ();
1665RX_DECL void rx_bitset_revdifference ();
1666RX_DECL void rx_bitset_xor ();
1667RX_DECL unsigned long rx_bitset_hash ();
1668RX_DECL struct rx_hash_item * rx_hash_find ();
1669RX_DECL struct rx_hash_item * rx_hash_store ();
1670RX_DECL void rx_hash_free ();
1671RX_DECL void rx_free_hash_table ();
1672RX_DECL rx_Bitset rx_cset ();
1673RX_DECL rx_Bitset rx_copy_cset ();
1674RX_DECL void rx_free_cset ();
1675RX_DECL struct rexp_node * rexp_node ();
1676RX_DECL struct rexp_node * rx_mk_r_cset ();
1677RX_DECL struct rexp_node * rx_mk_r_concat ();
1678RX_DECL struct rexp_node * rx_mk_r_alternate ();
1679RX_DECL struct rexp_node * rx_mk_r_opt ();
1680RX_DECL struct rexp_node * rx_mk_r_star ();
1681RX_DECL struct rexp_node * rx_mk_r_2phase_star ();
1682RX_DECL struct rexp_node * rx_mk_r_side_effect ();
1683RX_DECL struct rexp_node * rx_mk_r_data ();
1684RX_DECL void rx_free_rexp ();
1685RX_DECL struct rexp_node * rx_copy_rexp ();
1686RX_DECL struct rx_nfa_state * rx_nfa_state ();
1687RX_DECL void rx_free_nfa_state ();
1688RX_DECL struct rx_nfa_state * rx_id_to_nfa_state ();
1689RX_DECL struct rx_nfa_edge * rx_nfa_edge ();
1690RX_DECL void rx_free_nfa_edge ();
1691RX_DECL void rx_free_nfa ();
1692RX_DECL int rx_build_nfa ();
1693RX_DECL void rx_name_nfa_states ();
1694RX_DECL int rx_eclose_nfa ();
1695RX_DECL void rx_delete_epsilon_transitions ();
1696RX_DECL int rx_compactify_nfa ();
1697RX_DECL void rx_release_superset ();
1698RX_DECL struct rx_superset * rx_superset_cons ();
1699RX_DECL struct rx_superset * rx_superstate_eclosure_union ();
1700RX_DECL struct rx_superstate * rx_superstate ();
1701RX_DECL struct rx_inx * rx_handle_cache_miss ();
1702RX_DECL reg_errcode_t rx_compile ();
1703RX_DECL void rx_blow_up_fastmap ();
1704#endif /* STDC */
1705
1706
1707#endif /* RX_WANT_RX_DEFS */
1708
1709
1710
1711
1712#ifdef __STDC__
1713extern int re_search_2 (struct re_pattern_buffer *rxb,
1714 __const__ char * string1, int size1,
1715 __const__ char * string2, int size2,
1716 int startpos, int range,
1717 struct re_registers *regs,
1718 int stop);
1719extern int re_search (struct re_pattern_buffer * rxb, __const__ char *string,
1720 int size, int startpos, int range,
1721 struct re_registers *regs);
1722extern int re_match_2 (struct re_pattern_buffer * rxb,
1723 __const__ char * string1, int size1,
1724 __const__ char * string2, int size2,
1725 int pos, struct re_registers *regs, int stop);
1726extern int re_match (struct re_pattern_buffer * rxb,
1727 __const__ char * string,
1728 int size, int pos,
1729 struct re_registers *regs);
1730extern reg_syntax_t re_set_syntax (reg_syntax_t syntax);
1731extern void re_set_registers (struct re_pattern_buffer *bufp,
1732 struct re_registers *regs,
1733 unsigned num_regs,
1734 regoff_t * starts, regoff_t * ends);
1735extern __const__ char * re_compile_pattern (__const__ char *pattern,
1736 int length,
1737 struct re_pattern_buffer * rxb);
1738extern int re_compile_fastmap (struct re_pattern_buffer * rxb);
1739extern char * re_comp (__const__ char *s);
1740extern int re_exec (__const__ char *s);
1741extern int regcomp (regex_t * preg, __const__ char * pattern, int cflags);
1742extern int regexec (__const__ regex_t *preg, __const__ char *string,
1743 size_t nmatch, regmatch_t pmatch[],
1744 int eflags);
1745extern size_t regerror (int errcode, __const__ regex_t *preg,
1746 char *errbuf, size_t errbuf_size);
1747extern void regfree (regex_t *preg);
1748
1749#else /* STDC */
1750extern int re_search_2 ();
1751extern int re_search ();
1752extern int re_match_2 ();
1753extern int re_match ();
1754extern reg_syntax_t re_set_syntax ();
1755extern void re_set_registers ();
1756extern __const__ char * re_compile_pattern ();
1757extern int re_compile_fastmap ();
1758extern char * re_comp ();
1759extern int re_exec ();
1760extern int regcomp ();
1761extern int regexec ();
1762extern size_t regerror ();
1763extern void regfree ();
1764
1765#endif /* STDC */
1766
1767
1768
1769
1770#ifdef RX_WANT_RX_DEFS
1771
1772struct rx_counter_frame
1773{
1774 int tag;
1775 int val;
1776 struct rx_counter_frame * inherited_from; /* If this is a copy. */
1777 struct rx_counter_frame * cdr;
1778};
1779
1780struct rx_backtrack_frame
1781{
1782 char * counter_stack_sp;
1783
1784 /* A frame is used to save the matchers state when it crosses a
1785 * backtracking point. The `stk_' fields correspond to variables
1786 * in re_search_2 (just strip off thes `stk_'). They are documented
1787 * tere.
1788 */
1789 struct rx_superstate * stk_super;
1790 unsigned int stk_c;
1791 struct rx_string_position stk_test_pos;
1792 int stk_last_l;
1793 int stk_last_r;
1794 int stk_test_ret;
1795
1796 /* This is the list of options left to explore at the backtrack
1797 * point for which this frame was created.
1798 */
1799 struct rx_distinct_future * df;
1800 struct rx_distinct_future * first_df;
1801
1802#ifdef RX_DEBUG
1803 int stk_line_no;
1804#endif
1805};
1806
1807struct rx_stack_chunk
1808{
1809 struct rx_stack_chunk * next_chunk;
1810 int bytes_left;
1811 char * sp;
1812};
1813
1814enum rx_outer_entry
1815{
1816 rx_outer_start,
1817 rx_outer_fastmap,
1818 rx_outer_test,
1819 rx_outer_restore_pos
1820};
1821
1822enum rx_fastmap_return
1823{
1824 rx_fastmap_continuation,
1825 rx_fastmap_error,
1826 rx_fastmap_ok,
1827 rx_fastmap_fail
1828};
1829
1830enum rx_fastmap_entry
1831{
1832 rx_fastmap_start,
1833 rx_fastmap_string_break
1834};
1835
1836enum rx_test_return
1837{
1838 rx_test_continuation,
1839 rx_test_error,
1840 rx_test_fail,
1841 rx_test_ok
1842};
1843
1844enum rx_test_internal_return
1845{
1846 rx_test_internal_error,
1847 rx_test_found_first,
1848 rx_test_line_finished
1849};
1850
1851enum rx_test_match_entry
1852{
1853 rx_test_start,
1854 rx_test_cache_hit_loop,
1855 rx_test_backreference_check,
1856 rx_test_backtrack_return
1857};
1858
1859struct rx_search_state
1860{
1861 /* Two groups of registers are kept. The group with the register state
1862 * of the current test match, and the group that holds the state at the end
1863 * of the best known match, if any.
1864 *
1865 * For some patterns, there may also be registers saved on the stack.
1866 */
1867 unsigned num_regs; /* Includes an element for register zero. */
1868 regoff_t * lparen; /* scratch space for register returns */
1869 regoff_t * rparen;
1870 regoff_t * best_lpspace; /* in case the user doesn't want these */
1871 regoff_t * best_rpspace; /* values, we still need space to store
1872 * them. Normally, this memoryis unused
1873 * and the space pointed to by REGS is
1874 * used instead.
1875 */
1876
1877 int last_l; /* Highest index of a valid lparen. */
1878 int last_r; /* It's dual. */
1879
1880 int * best_lparen; /* This contains the best known register */
1881 int * best_rparen; /* assignments.
1882 * This may point to the same mem as
1883 * best_lpspace, or it might point to memory
1884 * passed by the caller.
1885 */
1886 int best_last_l; /* best_last_l:best_lparen::last_l:lparen */
1887 int best_last_r;
1888
1889
1890 unsigned char * translate;
1891
1892 struct rx_string_position outer_pos;
1893
1894 struct rx_superstate * start_super;
1895 int nfa_choice;
1896 int first_found; /* If true, return after finding any match. */
1897 int ret_val;
1898
1899 /* For continuations... */
1900 enum rx_outer_entry outer_search_resume_pt;
1901 struct re_pattern_buffer * saved_rxb;
1902 int saved_startpos;
1903 int saved_range;
1904 int saved_stop;
1905 int saved_total_size;
1906 rx_get_burst_fn saved_get_burst;
1907 rx_back_check_fn saved_back_check;
1908 struct re_registers * saved_regs;
1909
1910 /**
1911 ** state for fastmap
1912 **/
1913 char * fastmap;
1914 int fastmap_chr;
1915 int fastmap_val;
1916
1917 /* for continuations in the fastmap procedure: */
1918 enum rx_fastmap_entry fastmap_resume_pt;
1919
1920 /**
1921 ** state for test_match
1922 **/
1923
1924 /* The current superNFA position of the matcher. */
1925 struct rx_superstate * super;
1926
1927 /* The matcher interprets a series of instruction frames.
1928 * This is the `instruction counter' for the interpretation.
1929 */
1930 struct rx_inx * ifr;
1931
1932 /* We insert a ghost character in the string to prime
1933 * the nfa. test_pos.pos, test_pos.str_half, and test_pos.end_half
1934 * keep track of the test-match position and string-half.
1935 */
1936 unsigned char c;
1937
1938 /* Position within the string. */
1939 struct rx_string_position test_pos;
1940
1941 struct rx_stack_chunk * counter_stack;
1942 struct rx_stack_chunk * backtrack_stack;
1943 int backtrack_frame_bytes;
1944 int chunk_bytes;
1945 struct rx_stack_chunk * free_chunks;
1946
1947 /* To return from this function, set test_ret and
1948 * `goto test_do_return'.
1949 *
1950 * Possible return values are:
1951 * 1 --- end of string while the superNFA is still going
1952 * 0 --- internal error (out of memory)
1953 * -1 --- search completed by reaching the superNFA fail state
1954 * -2 --- a match was found, maybe not the longest.
1955 *
1956 * When the search is complete (-1), best_last_r indicates whether
1957 * a match was found.
1958 *
1959 * -2 is return only if search_state.first_found is non-zero.
1960 *
1961 * if search_state.first_found is non-zero, a return of -1 indicates no match,
1962 * otherwise, best_last_r has to be checked.
1963 */
1964 int test_ret;
1965
1966 int could_have_continued;
1967
1968#ifdef RX_DEBUG
1969 int backtrack_depth;
1970 /* There is a search tree with every node as set of deterministic
1971 * transitions in the super nfa. For every branch of a
1972 * backtrack point is an edge in the tree.
1973 * This counts up a pre-order of nodes in that tree.
1974 * It's saved on the search stack and printed when debugging.
1975 */
1976 int line_no;
1977 int lines_found;
1978#endif
1979
1980
1981 /* For continuations within the match tester */
1982 enum rx_test_match_entry test_match_resume_pt;
1983 struct rx_inx * saved_next_tr_table;
1984 struct rx_inx * saved_this_tr_table;
1985 int saved_reg;
1986 struct rx_backtrack_frame * saved_bf;
1987
1988};
1989
1990
1991
1992extern char rx_slowmap[];
1993extern unsigned char rx_id_translation[];
1994
1995static __inline__ void
1996init_fastmap (rxb, search_state)
1997 struct re_pattern_buffer * rxb;
1998 struct rx_search_state * search_state;
1999{
2000 search_state->fastmap = (rxb->fastmap
2001 ? (char *)rxb->fastmap
2002 : (char *)rx_slowmap);
2003 /* Update the fastmap now if not correct already.
2004 * When the regexp was compiled, the fastmap was computed
2005 * and stored in a bitset. This expands the bitset into a
2006 * character array containing 1s and 0s.
2007 */
2008 if ((search_state->fastmap == rxb->fastmap) && !rxb->fastmap_accurate)
2009 rx_blow_up_fastmap (rxb);
2010 search_state->fastmap_chr = -1;
2011 search_state->fastmap_val = 0;
2012 search_state->fastmap_resume_pt = rx_fastmap_start;
2013}
2014
2015static __inline__ void
2016uninit_fastmap (rxb, search_state)
2017 struct re_pattern_buffer * rxb;
2018 struct rx_search_state * search_state;
2019{
2020 /* Unset the fastmap sentinel */
2021 if (search_state->fastmap_chr >= 0)
2022 search_state->fastmap[search_state->fastmap_chr]
2023 = search_state->fastmap_val;
2024}
2025
2026static __inline__ int
2027fastmap_search (rxb, stop, get_burst, app_closure, search_state)
2028 struct re_pattern_buffer * rxb;
2029 int stop;
2030 rx_get_burst_fn get_burst;
2031 void * app_closure;
2032 struct rx_search_state * search_state;
2033{
2034 enum rx_fastmap_entry pc;
2035
2036 if (0)
2037 {
2038 return_continuation:
2039 search_state->fastmap_resume_pt = pc;
2040 return rx_fastmap_continuation;
2041 }
2042
2043 pc = search_state->fastmap_resume_pt;
2044
2045 switch (pc)
2046 {
2047 default:
2048 return rx_fastmap_error;
2049 case rx_fastmap_start:
2050 init_fastmap_sentinal:
2051 /* For the sake of fast fastmapping, set a sentinal in the fastmap.
2052 * This sentinal will trap the fastmap loop when it reaches the last
2053 * valid character in a string half.
2054 *
2055 * This must be reset when the fastmap/search loop crosses a string
2056 * boundry, and before returning to the caller. So sometimes,
2057 * the fastmap loop is restarted with `continue', othertimes by
2058 * `goto init_fastmap_sentinal'.
2059 */
2060 if (search_state->outer_pos.size)
2061 {
2062 search_state->fastmap_chr = ((search_state->outer_pos.search_direction == 1)
2063 ? *(search_state->outer_pos.end - 1)
2064 : *search_state->outer_pos.string);
2065 search_state->fastmap_val
2066 = search_state->fastmap[search_state->fastmap_chr];
2067 search_state->fastmap[search_state->fastmap_chr] = 1;
2068 }
2069 else
2070 {
2071 search_state->fastmap_chr = -1;
2072 search_state->fastmap_val = 0;
2073 }
2074
2075 if (search_state->outer_pos.pos >= search_state->outer_pos.end)
2076 goto fastmap_hit_bound;
2077 else
2078 {
2079 if (search_state->outer_pos.search_direction == 1)
2080 {
2081 if (search_state->fastmap_val)
2082 {
2083 for (;;)
2084 {
2085 while (!search_state->fastmap[*search_state->outer_pos.pos])
2086 ++search_state->outer_pos.pos;
2087 return rx_fastmap_ok;
2088 }
2089 }
2090 else
2091 {
2092 for (;;)
2093 {
2094 while (!search_state->fastmap[*search_state->outer_pos.pos])
2095 ++search_state->outer_pos.pos;
2096 if (*search_state->outer_pos.pos != search_state->fastmap_chr)
2097 return rx_fastmap_ok;
2098 else
2099 {
2100 ++search_state->outer_pos.pos;
2101 if (search_state->outer_pos.pos == search_state->outer_pos.end)
2102 goto fastmap_hit_bound;
2103 }
2104 }
2105 }
2106 }
2107 else
2108 {
2109 __const__ unsigned char * bound;
2110 bound = search_state->outer_pos.string - 1;
2111 if (search_state->fastmap_val)
2112 {
2113 for (;;)
2114 {
2115 while (!search_state->fastmap[*search_state->outer_pos.pos])
2116 --search_state->outer_pos.pos;
2117 return rx_fastmap_ok;
2118 }
2119 }
2120 else
2121 {
2122 for (;;)
2123 {
2124 while (!search_state->fastmap[*search_state->outer_pos.pos])
2125 --search_state->outer_pos.pos;
2126 if ((*search_state->outer_pos.pos != search_state->fastmap_chr) || search_state->fastmap_val)
2127 return rx_fastmap_ok;
2128 else
2129 {
2130 --search_state->outer_pos.pos;
2131 if (search_state->outer_pos.pos == bound)
2132 goto fastmap_hit_bound;
2133 }
2134 }
2135 }
2136 }
2137 }
2138
2139 case rx_fastmap_string_break:
2140 fastmap_hit_bound:
2141 {
2142 /* If we hit a bound, it may be time to fetch another burst
2143 * of string, or it may be time to return a continuation to
2144 * the caller, or it might be time to fail.
2145 */
2146
2147 int burst_state;
2148 burst_state = get_burst (&search_state->outer_pos, app_closure, stop);
2149 switch (burst_state)
2150 {
2151 default:
2152 case rx_get_burst_error:
2153 return rx_fastmap_error;
2154 case rx_get_burst_continuation:
2155 {
2156 pc = rx_fastmap_string_break;
2157 goto return_continuation;
2158 }
2159 case rx_get_burst_ok:
2160 goto init_fastmap_sentinal;
2161 case rx_get_burst_no_more:
2162 /* ...not a string split, simply no more string.
2163 *
2164 * When searching backward, running out of string
2165 * is reason to quit.
2166 *
2167 * When searching forward, we allow the possibility
2168 * of an (empty) match after the last character in the
2169 * virtual string. So, fall through to the matcher
2170 */
2171 return ( (search_state->outer_pos.search_direction == 1)
2172 ? rx_fastmap_ok
2173 : rx_fastmap_fail);
2174 }
2175 }
2176 }
2177
2178}
2179
2180
2181
2182
2183#ifdef emacs
2184/* The `emacs' switch turns on certain matching commands
2185 * that make sense only in Emacs.
2186 */
2187#include "sysfuncs.h"
2188#include "lisp.h"
2189#include "buffer.h"
2190#include "syntax.h"
2191#endif /* emacs */
2192
2193/* Setting RX_MEMDBUG is useful if you have dbmalloc. Maybe with similar
2194 * packages too.
2195 */
2196#ifdef RX_MEMDBUG
2197#include <malloc.h>
2198#endif /* RX_RX_MEMDBUG */
2199
2200/* We used to test for `BSTRING' here, but only GCC and Emacs define
2201 * `BSTRING', as far as I know, and neither of them use this code.
2202 */
2203#if HAVE_STRING_H || STDC_HEADERS
2204#include <string.h>
2205
2206#ifndef bcmp
2207#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
2208#endif
2209
2210#ifndef bcopy
2211#define bcopy(s, d, n) memcpy ((d), (s), (n))
2212#endif
2213
2214#ifndef bzero
2215#define bzero(s, n) memset ((s), 0, (n))
2216#endif
2217
2218#else /* HAVE_STRING_H || STDC_HEADERS */
2219#include <strings.h>
2220#endif /* not (HAVE_STRING_H || STDC_HEADERS) */
2221
2222#ifdef STDC_HEADERS
2223#include <stdlib.h>
2224#else /* not STDC_HEADERS */
2225char *malloc ();
2226char *realloc ();
2227#endif /* not STDC_HEADERS */
2228
2229
2230
2231
2232
2233/* How many characters in the character set. */
2234#define CHAR_SET_SIZE (1 << CHARBITS)
2235
2236#ifndef emacs
2237/* Define the syntax basics for \<, \>, etc.
2238 * This must be nonzero for the wordchar and notwordchar pattern
2239 * commands in re_match_2.
2240 */
2241#ifndef Sword
2242#define Sword 1
2243#endif
2244#define SYNTAX(c) re_syntax_table[c]
2245#ifdef SYNTAX_TABLE
2246extern char *re_syntax_table;
2247#else
2248RX_DECL char re_syntax_table[CHAR_SET_SIZE];
2249#endif
2250#endif /* not emacs */
2251
2252
2253/* Test if at very beginning or at very end of the virtual concatenation
2254 * of `string1' and `string2'. If only one string, it's `string2'.
2255 */
2256
2257#define AT_STRINGS_BEG() \
2258 ( -1 \
2259 == ((search_state.test_pos.pos - search_state.test_pos.string) \
2260 + search_state.test_pos.offset))
2261
2262#define AT_STRINGS_END() \
2263 ( (total_size - 1) \
2264 == ((search_state.test_pos.pos - search_state.test_pos.string) \
2265 + search_state.test_pos.offset))
2266
2267
2268/* Test if POS + 1 points to a character which is word-constituent. We have
2269 * two special cases to check for: if past the end of string1, look at
2270 * the first character in string2; and if before the beginning of
2271 * string2, look at the last character in string1.
2272 *
2273 * Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG ().
2274 */
2275#define LETTER_P(POS,OFF) \
2276 ( SYNTAX (fetch_char(POS, OFF, app_closure, stop)) \
2277 == Sword)
2278
2279/* Test if the character at D and the one after D differ with respect
2280 * to being word-constituent.
2281 */
2282#define AT_WORD_BOUNDARY(d) \
2283 (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d,0) != LETTER_P (d, 1))
2284
2285
2286#ifdef RX_SUPPORT_CONTINUATIONS
2287#define RX_STACK_ALLOC(BYTES) malloc(BYTES)
2288#define RX_STACK_FREE(MEM) free(MEM)
2289#else
2290#define RX_STACK_ALLOC(BYTES) alloca(BYTES)
2291#define RX_STACK_FREE(MEM) \
2292 ((struct rx_stack_chunk *)MEM)->next_chunk = search_state.free_chunks; \
2293 search_state.free_chunks = ((struct rx_stack_chunk *)MEM);
2294
2295#endif
2296
2297#define PUSH(CHUNK_VAR,BYTES) \
2298 if (!CHUNK_VAR || (CHUNK_VAR->bytes_left < (BYTES))) \
2299 { \
2300 struct rx_stack_chunk * new_chunk; \
2301 if (search_state.free_chunks) \
2302 { \
2303 new_chunk = search_state.free_chunks; \
2304 search_state.free_chunks = search_state.free_chunks->next_chunk; \
2305 } \
2306 else \
2307 { \
2308 new_chunk = (struct rx_stack_chunk *)RX_STACK_ALLOC(search_state.chunk_bytes); \
2309 if (!new_chunk) \
2310 { \
2311 search_state.ret_val = 0; \
2312 goto test_do_return; \
2313 } \
2314 } \
2315 new_chunk->sp = (char *)new_chunk + sizeof (struct rx_stack_chunk); \
2316 new_chunk->bytes_left = (search_state.chunk_bytes \
2317 - (BYTES) \
2318 - sizeof (struct rx_stack_chunk)); \
2319 new_chunk->next_chunk = CHUNK_VAR; \
2320 CHUNK_VAR = new_chunk; \
2321 } \
2322 else \
2323 (CHUNK_VAR->sp += (BYTES)), (CHUNK_VAR->bytes_left -= (BYTES))
2324
2325#define POP(CHUNK_VAR,BYTES) \
2326 if (CHUNK_VAR->sp == ((char *)CHUNK_VAR + sizeof(*CHUNK_VAR))) \
2327 { \
2328 struct rx_stack_chunk * new_chunk = CHUNK_VAR->next_chunk; \
2329 RX_STACK_FREE(CHUNK_VAR); \
2330 CHUNK_VAR = new_chunk; \
2331 } \
2332 else \
2333 (CHUNK_VAR->sp -= BYTES), (CHUNK_VAR->bytes_left += BYTES)
2334
2335
2336
2337#define SRCH_TRANSLATE(C) search_state.translate[(unsigned char) (C)]
2338
2339
2340
2341
2342
2343#ifdef __STDC__
2344RX_DECL __inline__ int
2345rx_search (struct re_pattern_buffer * rxb,
2346 int startpos,
2347 int range,
2348 int stop,
2349 int total_size,
2350 rx_get_burst_fn get_burst,
2351 rx_back_check_fn back_check,
2352 rx_fetch_char_fn fetch_char,
2353 void * app_closure,
2354 struct re_registers * regs,
2355 struct rx_search_state * resume_state,
2356 struct rx_search_state * save_state)
2357#else
2358RX_DECL __inline__ int
2359rx_search (rxb, startpos, range, stop, total_size,
2360 get_burst, back_check, fetch_char,
2361 app_closure, regs, resume_state, save_state)
2362 struct re_pattern_buffer * rxb;
2363 int startpos;
2364 int range;
2365 int stop;
2366 int total_size;
2367 rx_get_burst_fn get_burst;
2368 rx_back_check_fn back_check;
2369 rx_fetch_char_fn fetch_char;
2370 void * app_closure;
2371 struct re_registers * regs;
2372 struct rx_search_state * resume_state;
2373 struct rx_search_state * save_state;
2374#endif
2375{
2376 int pc;
2377 int test_state;
2378 struct rx_search_state search_state;
2379
2380 search_state.free_chunks = 0;
2381 if (!resume_state)
2382 pc = rx_outer_start;
2383 else
2384 {
2385 search_state = *resume_state;
2386 regs = search_state.saved_regs;
2387 rxb = search_state.saved_rxb;
2388 startpos = search_state.saved_startpos;
2389 range = search_state.saved_range;
2390 stop = search_state.saved_stop;
2391 total_size = search_state.saved_total_size;
2392 get_burst = search_state.saved_get_burst;
2393 back_check = search_state.saved_back_check;
2394 pc = search_state.outer_search_resume_pt;
2395 if (0)
2396 {
2397 return_continuation:
2398 if (save_state)
2399 {
2400 *save_state = search_state;
2401 save_state->saved_regs = regs;
2402 save_state->saved_rxb = rxb;
2403 save_state->saved_startpos = startpos;
2404 save_state->saved_range = range;
2405 save_state->saved_stop = stop;
2406 save_state->saved_total_size = total_size;
2407 save_state->saved_get_burst = get_burst;
2408 save_state->saved_back_check = back_check;
2409 save_state->outer_search_resume_pt = pc;
2410 }
2411 return rx_search_continuation;
2412 }
2413 }
2414
2415 switch (pc)
2416 {
2417 case rx_outer_start:
2418 search_state.ret_val = rx_search_fail;
2419 ( search_state.lparen
2420 = search_state.rparen
2421 = search_state.best_lpspace
2422 = search_state.best_rpspace
2423 = 0);
2424
2425 /* figure the number of registers we may need for use in backreferences.
2426 * the number here includes an element for register zero.
2427 */
2428 search_state.num_regs = rxb->re_nsub + 1;
2429
2430
2431 /* check for out-of-range startpos. */
2432 if ((startpos < 0) || (startpos > total_size))
2433 return rx_search_fail;
2434
2435 /* fix up range if it might eventually take us outside the string. */
2436 {
2437 int endpos;
2438 endpos = startpos + range;
2439 if (endpos < -1)
2440 range = (-1 - startpos);
2441 else if (endpos > (total_size + 1))
2442 range = total_size - startpos;
2443 }
2444
2445 /* if the search isn't to be a backwards one, don't waste time in a
2446 * long search for a pattern that says it is anchored.
2447 */
2448 if (rxb->begbuf_only && (range > 0))
2449 {
2450 if (startpos > 0)
2451 return rx_search_fail;
2452 else
2453 range = 1;
2454 }
2455
2456 /* decide whether to use internal or user-provided reg buffers. */
2457 if (!regs || rxb->no_sub)
2458 {
2459 search_state.best_lpspace =
2460 (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2461 search_state.best_rpspace =
2462 (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2463 search_state.best_lparen = search_state.best_lpspace;
2464 search_state.best_rparen = search_state.best_rpspace;
2465 }
2466 else
2467 {
2468 /* have the register data arrays been allocated? */
2469 if (rxb->regs_allocated == REGS_UNALLOCATED)
2470 { /* no. so allocate them with malloc. we need one
2471 extra element beyond `search_state.num_regs' for the `-1' marker
2472 gnu code uses. */
2473 regs->num_regs = MAX (RE_NREGS, rxb->re_nsub + 1);
2474 regs->start = ((regoff_t *)
2475 malloc (regs->num_regs * sizeof ( regoff_t)));
2476 regs->end = ((regoff_t *)
2477 malloc (regs->num_regs * sizeof ( regoff_t)));
2478 if (regs->start == 0 || regs->end == 0)
2479 return rx_search_error;
2480 rxb->regs_allocated = REGS_REALLOCATE;
2481 }
2482 else if (rxb->regs_allocated == REGS_REALLOCATE)
2483 { /* yes. if we need more elements than were already
2484 allocated, reallocate them. if we need fewer, just
2485 leave it alone. */
2486 if (regs->num_regs < search_state.num_regs + 1)
2487 {
2488 regs->num_regs = search_state.num_regs + 1;
2489 regs->start = ((regoff_t *)
2490 realloc (regs->start,
2491 regs->num_regs * sizeof (regoff_t)));
2492 regs->end = ((regoff_t *)
2493 realloc (regs->end,
2494 regs->num_regs * sizeof ( regoff_t)));
2495 if (regs->start == 0 || regs->end == 0)
2496 return rx_search_error;
2497 }
2498 }
2499 else if (rxb->regs_allocated != REGS_FIXED)
2500 return rx_search_error;
2501
2502 if (regs->num_regs < search_state.num_regs + 1)
2503 {
2504 search_state.best_lpspace =
2505 ((regoff_t *)
2506 REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
2507 search_state.best_rpspace =
2508 ((regoff_t *)
2509 REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
2510 search_state.best_lparen = search_state.best_lpspace;
2511 search_state.best_rparen = search_state.best_rpspace;
2512 }
2513 else
2514 {
2515 search_state.best_lparen = regs->start;
2516 search_state.best_rparen = regs->end;
2517 }
2518 }
2519
2520 search_state.lparen =
2521 (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2522 search_state.rparen =
2523 (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2524
2525 if (! ( search_state.best_rparen
2526 && search_state.best_lparen
2527 && search_state.lparen && search_state.rparen))
2528 return rx_search_error;
2529
2530 search_state.best_last_l = search_state.best_last_r = -1;
2531
2532 search_state.translate = (rxb->translate
2533 ? rxb->translate
2534 : rx_id_translation);
2535
2536
2537
2538 /*
2539 * two nfa's were compiled.
2540 * `0' is complete.
2541 * `1' faster but gets registers wrong and ends too soon.
2542 */
2543 search_state.nfa_choice = (regs && !rxb->least_subs) ? '\0' : '\1';
2544
2545 /* we have the option to look for the best match or the first
2546 * one we can find. if the user isn't asking for register information,
2547 * we don't need to find the best match.
2548 */
2549 search_state.first_found = !regs;
2550
2551 if (range >= 0)
2552 {
2553 search_state.outer_pos.search_end = startpos + range;
2554 search_state.outer_pos.search_direction = 1;
2555 }
2556 else
2557 {
2558 search_state.outer_pos.search_end = startpos + range;
2559 search_state.outer_pos.search_direction = -1;
2560 }
2561
2562 /* the vacuous search always turns up nothing. */
2563 if ((search_state.outer_pos.search_direction == 1)
2564 ? (startpos > search_state.outer_pos.search_end)
2565 : (startpos < search_state.outer_pos.search_end))
2566 return rx_search_fail;
2567
2568 /* now we build the starting state of the supernfa. */
2569 {
2570 struct rx_superset * start_contents;
2571 struct rx_nfa_state_set * start_nfa_set;
2572
2573 /* we presume here that the nfa start state has only one
2574 * possible future with no side effects.
2575 */
2576 start_nfa_set = rxb->start->futures->destset;
2577 if ( rxb->rx.start_set
2578 && (rxb->rx.start_set->starts_for == &rxb->rx))
2579 start_contents = rxb->rx.start_set;
2580 else
2581 {
2582 start_contents =
2583 rx_superstate_eclosure_union (&rxb->rx,
2584 rx_superset_cons (&rxb->rx, 0, 0),
2585 start_nfa_set);
2586
2587 if (!start_contents)
2588 return rx_search_fail;
2589
2590 start_contents->starts_for = &rxb->rx;
2591 rxb->rx.start_set = start_contents;
2592 }
2593 if ( start_contents->superstate
2594 && (start_contents->superstate->rx_id == rxb->rx.rx_id))
2595 {
2596 search_state.start_super = start_contents->superstate;
2597 rx_lock_superstate (&rxb->rx, search_state.start_super);
2598 }
2599 else
2600 {
2601 rx_protect_superset (&rxb->rx, start_contents);
2602
2603 search_state.start_super = rx_superstate (&rxb->rx, start_contents);
2604 if (!search_state.start_super)
2605 return rx_search_fail;
2606 rx_lock_superstate (&rxb->rx, search_state.start_super);
2607 rx_release_superset (&rxb->rx, start_contents);
2608 }
2609 }
2610
2611
2612 /* The outer_pos tracks the position within the strings
2613 * as seen by loop that calls fastmap_search.
2614 *
2615 * The caller supplied get_burst function actually
2616 * gives us pointers to chars.
2617 *
2618 * Communication with the get_burst function is through an
2619 * rx_string_position structure. Here, the structure for
2620 * outer_pos is initialized. It is set to point to the
2621 * NULL string, at an offset of STARTPOS. STARTPOS is out
2622 * of range of the NULL string, so the first call to
2623 * getburst will patch up the rx_string_position to point
2624 * to valid characters.
2625 */
2626
2627 ( search_state.outer_pos.string
2628 = search_state.outer_pos.end
2629 = 0);
2630
2631 search_state.outer_pos.offset = 0;
2632 search_state.outer_pos.size = 0;
2633 search_state.outer_pos.pos = (unsigned char *)startpos;
2634 init_fastmap (rxb, &search_state);
2635
2636 search_state.fastmap_resume_pt = rx_fastmap_start;
2637 case rx_outer_fastmap:
2638 /* do { */
2639 pseudo_do:
2640 {
2641 {
2642 int fastmap_state;
2643 fastmap_state = fastmap_search (rxb, stop, get_burst, app_closure,
2644 &search_state);
2645 switch (fastmap_state)
2646 {
2647 case rx_fastmap_continuation:
2648 pc = rx_outer_fastmap;
2649 goto return_continuation;
2650 case rx_fastmap_fail:
2651 goto finish;
2652 case rx_fastmap_ok:
2653 break;
2654 }
2655 }
2656
2657 /* now the fastmap loop has brought us to a plausible
2658 * starting point for a match. so, it's time to run the
2659 * nfa and see if a match occured.
2660 */
2661 startpos = ( search_state.outer_pos.pos
2662 - search_state.outer_pos.string
2663 + search_state.outer_pos.offset);
2664/*|*/ if ((range > 0) && (startpos == search_state.outer_pos.search_end))
2665/*|*/ goto finish;
2666 }
2667
2668 search_state.test_match_resume_pt = rx_test_start;
2669 /* do interrupted for entry point... */
2670 case rx_outer_test:
2671 /* ...do continued */
2672 {
2673 goto test_match;
2674 test_returns_to_search:
2675 switch (test_state)
2676 {
2677 case rx_test_continuation:
2678 pc = rx_outer_test;
2679 goto return_continuation;
2680 case rx_test_error:
2681 search_state.ret_val = rx_search_error;
2682 goto finish;
2683 case rx_test_fail:
2684 break;
2685 case rx_test_ok:
2686 goto finish;
2687 }
2688 search_state.outer_pos.pos += search_state.outer_pos.search_direction;
2689 startpos += search_state.outer_pos.search_direction;
2690#if 0
2691/*|*/ if (search_state.test_pos.pos < search_state.test_pos.end)
2692/*|*/ break;
2693#endif
2694 }
2695 /* do interrupted for entry point... */
2696 case rx_outer_restore_pos:
2697 {
2698 int x;
2699 x = get_burst (&search_state.outer_pos, app_closure, stop);
2700 switch (x)
2701 {
2702 case rx_get_burst_continuation:
2703 pc = rx_outer_restore_pos;
2704 goto return_continuation;
2705 case rx_get_burst_error:
2706 search_state.ret_val = rx_search_error;
2707 goto finish;
2708 case rx_get_burst_no_more:
2709 if (rxb->can_match_empty)
2710 break;
2711 goto finish;
2712 case rx_get_burst_ok:
2713 break;
2714 }
2715 } /* } while (...see below...) */
2716
2717 if ((search_state.outer_pos.search_direction == 1)
2718 ? (startpos < search_state.outer_pos.search_end)
2719 : (startpos > search_state.outer_pos.search_end))
2720 goto pseudo_do;
2721
2722
2723 finish:
2724 uninit_fastmap (rxb, &search_state);
2725 if (search_state.start_super)
2726 rx_unlock_superstate (&rxb->rx, search_state.start_super);
2727
2728#ifdef regex_malloc
2729 if (search_state.lparen) free (search_state.lparen);
2730 if (search_state.rparen) free (search_state.rparen);
2731 if (search_state.best_lpspace) free (search_state.best_lpspace);
2732 if (search_state.best_rpspace) free (search_state.best_rpspace);
2733#endif
2734 return search_state.ret_val;
2735 }
2736
2737
2738 test_match:
2739 {
2740 enum rx_test_match_entry test_pc;
2741 int inx;
2742 test_pc = search_state.test_match_resume_pt;
2743 if (test_pc == rx_test_start)
2744 {
2745#ifdef RX_DEBUG
2746 search_state.backtrack_depth = 0;
2747#endif
2748 search_state.last_l = search_state.last_r = 0;
2749 search_state.lparen[0] = startpos;
2750 search_state.super = search_state.start_super;
2751 search_state.c = search_state.nfa_choice;
2752 search_state.test_pos.pos = search_state.outer_pos.pos - 1;
2753 search_state.test_pos.string = search_state.outer_pos.string;
2754 search_state.test_pos.end = search_state.outer_pos.end;
2755 search_state.test_pos.offset = search_state.outer_pos.offset;
2756 search_state.test_pos.size = search_state.outer_pos.size;
2757 search_state.test_pos.search_direction = 1;
2758 search_state.counter_stack = 0;
2759 search_state.backtrack_stack = 0;
2760 search_state.backtrack_frame_bytes =
2761 (sizeof (struct rx_backtrack_frame)
2762 + (rxb->match_regs_on_stack
2763 ? sizeof (regoff_t) * (search_state.num_regs + 1) * 2
2764 : 0));
2765 search_state.chunk_bytes = search_state.backtrack_frame_bytes * 64;
2766 search_state.test_ret = rx_test_line_finished;
2767 search_state.could_have_continued = 0;
2768 }
2769 /* This is while (1)...except that the body of the loop is interrupted
2770 * by some alternative entry points.
2771 */
2772 pseudo_while_1:
2773 switch (test_pc)
2774 {
2775 case rx_test_cache_hit_loop:
2776 goto resume_continuation_1;
2777 case rx_test_backreference_check:
2778 goto resume_continuation_2;
2779 case rx_test_backtrack_return:
2780 goto resume_continuation_3;
2781 case rx_test_start:
2782#ifdef RX_DEBUG
2783 /* There is a search tree with every node as set of deterministic
2784 * transitions in the super nfa. For every branch of a
2785 * backtrack point is an edge in the tree.
2786 * This counts up a pre-order of nodes in that tree.
2787 * It's saved on the search stack and printed when debugging.
2788 */
2789 search_state.line_no = 0;
2790 search_state.lines_found = 0;
2791#endif
2792
2793 top_of_cycle:
2794 /* A superstate is basicly a transition table, indexed by
2795 * characters from the string being tested, and containing
2796 * RX_INX (`instruction frame') structures.
2797 */
2798 search_state.ifr = &search_state.super->transitions [search_state.c];
2799
2800 recurse_test_match:
2801 /* This is the point to which control is sent when the
2802 * test matcher `recurses'. Before jumping here, some variables
2803 * need to be saved on the stack and the next instruction frame
2804 * has to be computed.
2805 */
2806
2807 restart:
2808 /* Some instructions don't advance the matcher, but just
2809 * carry out some side effects and fetch a new instruction.
2810 * To dispatch that new instruction, `goto restart'.
2811 */
2812
2813 {
2814 struct rx_inx * next_tr_table;
2815 struct rx_inx * this_tr_table;
2816 /* The fastest route through the loop is when the instruction
2817 * is RX_NEXT_CHAR. This case is detected when SEARCH_STATE.IFR->DATA
2818 * is non-zero. In that case, it points to the next
2819 * superstate.
2820 *
2821 * This allows us to not bother fetching the bytecode.
2822 */
2823 next_tr_table = (struct rx_inx *)search_state.ifr->data;
2824 this_tr_table = search_state.super->transitions;
2825 while (next_tr_table)
2826 {
2827#ifdef RX_DEBUG_0
2828 if (rx_debug_trace)
2829 {
2830 struct rx_superset * setp;
2831
2832 fprintf (stderr, "%d %d>> re_next_char @ %d (%d)",
2833 search_state.line_no,
2834 search_state.backtrack_depth,
2835 (search_state.test_pos.pos - search_state.test_pos.string
2836 + search_state.test_pos.offset), search_state.c);
2837
2838 search_state.super =
2839 ((struct rx_superstate *)
2840 ((char *)this_tr_table
2841 - ((unsigned long)
2842 ((struct rx_superstate *)0)->transitions)));
2843
2844 setp = search_state.super->contents;
2845 fprintf (stderr, " superstet (rx=%d, &=%x: ",
2846 rxb->rx.rx_id, setp);
2847 while (setp)
2848 {
2849 fprintf (stderr, "%d ", setp->id);
2850 setp = setp->cdr;
2851 }
2852 fprintf (stderr, "\n");
2853 }
2854#endif
2855 this_tr_table = next_tr_table;
2856 ++search_state.test_pos.pos;
2857 if (search_state.test_pos.pos == search_state.test_pos.end)
2858 {
2859 int burst_state;
2860 try_burst_1:
2861 burst_state = get_burst (&search_state.test_pos,
2862 app_closure, stop);
2863 switch (burst_state)
2864 {
2865 case rx_get_burst_continuation:
2866 search_state.saved_this_tr_table = this_tr_table;
2867 search_state.saved_next_tr_table = next_tr_table;
2868 test_pc = rx_test_cache_hit_loop;
2869 goto test_return_continuation;
2870
2871 resume_continuation_1:
2872 /* Continuation one jumps here to do its work: */
2873 search_state.saved_this_tr_table = this_tr_table;
2874 search_state.saved_next_tr_table = next_tr_table;
2875 goto try_burst_1;
2876
2877 case rx_get_burst_ok:
2878 /* get_burst succeeded...keep going */
2879 break;
2880
2881 case rx_get_burst_no_more:
2882 search_state.test_ret = rx_test_line_finished;
2883 search_state.could_have_continued = 1;
2884 goto test_do_return;
2885
2886 case rx_get_burst_error:
2887 /* An error... */
2888 search_state.test_ret = rx_test_internal_error;
2889 goto test_do_return;
2890 }
2891 }
2892 search_state.c = *search_state.test_pos.pos;
2893 search_state.ifr = this_tr_table + search_state.c;
2894 next_tr_table = (struct rx_inx *)search_state.ifr->data;
2895 } /* Fast loop through cached transition tables */
2896
2897 /* Here when we ran out of cached next-char transitions.
2898 * So, it will be necessary to do a more expensive
2899 * dispatch on the current instruction. The superstate
2900 * pointer is allowed to become invalid during next-char
2901 * transitions -- now we must bring it up to date.
2902 */
2903 search_state.super =
2904 ((struct rx_superstate *)
2905 ((char *)this_tr_table
2906 - ((unsigned long)
2907 ((struct rx_superstate *)0)->transitions)));
2908 }
2909
2910 /* We've encountered an instruction other than next-char.
2911 * Dispatch that instruction:
2912 */
2913 inx = (int)search_state.ifr->inx;
2914#ifdef RX_DEBUG_0
2915 if (rx_debug_trace)
2916 {
2917 struct rx_superset * setp = search_state.super->contents;
2918
2919 fprintf (stderr, "%d %d>> %s @ %d (%d)", search_state.line_no,
2920 search_state.backtrack_depth,
2921 inx_names[inx],
2922 (search_state.test_pos.pos - search_state.test_pos.string
2923 + (test_pos.half == 0 ? 0 : size1)), search_state.c);
2924
2925 fprintf (stderr, " superstet (rx=%d, &=%x: ",
2926 rxb->rx.rx_id, setp);
2927 while (setp)
2928 {
2929 fprintf (stderr, "%d ", setp->id);
2930 setp = setp->cdr;
2931 }
2932 fprintf (stderr, "\n");
2933 }
2934#endif
2935 switch ((enum rx_opcode)inx)
2936 {
2937 case rx_do_side_effects:
2938
2939 /* RX_DO_SIDE_EFFECTS occurs when we cross epsilon
2940 * edges associated with parentheses, backreferencing, etc.
2941 */
2942 {
2943 struct rx_distinct_future * df =
2944 (struct rx_distinct_future *)search_state.ifr->data_2;
2945 struct rx_se_list * el = df->effects;
2946 /* Side effects come in lists. This walks down
2947 * a list, dispatching.
2948 */
2949 while (el)
2950 {
2951 long effect;
2952 effect = (long)el->car;
2953 if (effect < 0)
2954 {
2955#ifdef RX_DEBUG_0
2956 if (rx_debug_trace)
2957 {
2958 struct rx_superset * setp = search_state.super->contents;
2959
2960 fprintf (stderr, "....%d %d>> %s\n", search_state.line_no,
2961 search_state.backtrack_depth,
2962 efnames[-effect]);
2963 }
2964#endif
2965 switch ((enum re_side_effects) effect)
2966
2967 {
2968 case re_se_pushback:
2969 search_state.ifr = &df->future_frame;
2970 if (!search_state.ifr->data)
2971 {
2972 struct rx_superstate * sup;
2973 sup = search_state.super;
2974 rx_lock_superstate (rx, sup);
2975 if (!rx_handle_cache_miss (&rxb->rx,
2976 search_state.super,
2977 search_state.c,
2978 (search_state.ifr
2979 ->data_2)))
2980 {
2981 rx_unlock_superstate (rx, sup);
2982 search_state.test_ret = rx_test_internal_error;
2983 goto test_do_return;
2984 }
2985 rx_unlock_superstate (rx, sup);
2986 }
2987 /* --search_state.test_pos.pos; */
2988 search_state.c = 't';
2989 search_state.super
2990 = ((struct rx_superstate *)
2991 ((char *)search_state.ifr->data
2992 - (long)(((struct rx_superstate *)0)
2993 ->transitions)));
2994 goto top_of_cycle;
2995 break;
2996 case re_se_push0:
2997 {
2998 struct rx_counter_frame * old_cf
2999 = (search_state.counter_stack
3000 ? ((struct rx_counter_frame *)
3001 search_state.counter_stack->sp)
3002 : 0);
3003 struct rx_counter_frame * cf;
3004 PUSH (search_state.counter_stack,
3005 sizeof (struct rx_counter_frame));
3006 cf = ((struct rx_counter_frame *)
3007 search_state.counter_stack->sp);
3008 cf->tag = re_se_iter;
3009 cf->val = 0;
3010 cf->inherited_from = 0;
3011 cf->cdr = old_cf;
3012 break;
3013 }
3014 case re_se_fail:
3015 goto test_do_return;
3016 case re_se_begbuf:
3017 if (!AT_STRINGS_BEG ())
3018 goto test_do_return;
3019 break;
3020 case re_se_endbuf:
3021 if (!AT_STRINGS_END ())
3022 goto test_do_return;
3023 break;
3024 case re_se_wordbeg:
3025 if ( LETTER_P (&search_state.test_pos, 1)
3026 && ( AT_STRINGS_BEG()
3027 || !LETTER_P (&search_state.test_pos, 0)))
3028 break;
3029 else
3030 goto test_do_return;
3031 case re_se_wordend:
3032 if ( !AT_STRINGS_BEG ()
3033 && LETTER_P (&search_state.test_pos, 0)
3034 && (AT_STRINGS_END ()
3035 || !LETTER_P (&search_state.test_pos, 1)))
3036 break;
3037 else
3038 goto test_do_return;
3039 case re_se_wordbound:
3040 if (AT_WORD_BOUNDARY (&search_state.test_pos))
3041 break;
3042 else
3043 goto test_do_return;
3044 case re_se_notwordbound:
3045 if (!AT_WORD_BOUNDARY (&search_state.test_pos))
3046 break;
3047 else
3048 goto test_do_return;
3049 case re_se_hat:
3050 if (AT_STRINGS_BEG ())
3051 {
3052 if (rxb->not_bol)
3053 goto test_do_return;
3054 else
3055 break;
3056 }
3057 else
3058 {
3059 char pos_c = *search_state.test_pos.pos;
3060 if ( (SRCH_TRANSLATE (pos_c)
3061 == SRCH_TRANSLATE('\n'))
3062 && rxb->newline_anchor)
3063 break;
3064 else
3065 goto test_do_return;
3066 }
3067 case re_se_dollar:
3068 if (AT_STRINGS_END ())
3069 {
3070 if (rxb->not_eol)
3071 goto test_do_return;
3072 else
3073 break;
3074 }
3075 else
3076 {
3077 if ( ( SRCH_TRANSLATE (fetch_char
3078 (&search_state.test_pos, 1,
3079 app_closure, stop))
3080 == SRCH_TRANSLATE ('\n'))
3081 && rxb->newline_anchor)
3082 break;
3083 else
3084 goto test_do_return;
3085 }
3086
3087 case re_se_try:
3088 /* This is the first side effect in every
3089 * expression.
3090 *
3091 * FOR NO GOOD REASON...get rid of it...
3092 */
3093 break;
3094
3095 case re_se_pushpos:
3096 {
3097 int urhere =
3098 ((int)(search_state.test_pos.pos
3099 - search_state.test_pos.string)
3100 + search_state.test_pos.offset);
3101 struct rx_counter_frame * old_cf
3102 = (search_state.counter_stack
3103 ? ((struct rx_counter_frame *)
3104 search_state.counter_stack->sp)
3105 : 0);
3106 struct rx_counter_frame * cf;
3107 PUSH(search_state.counter_stack,
3108 sizeof (struct rx_counter_frame));
3109 cf = ((struct rx_counter_frame *)
3110 search_state.counter_stack->sp);
3111 cf->tag = re_se_pushpos;
3112 cf->val = urhere;
3113 cf->inherited_from = 0;
3114 cf->cdr = old_cf;
3115 break;
3116 }
3117
3118 case re_se_chkpos:
3119 {
3120 int urhere =
3121 ((int)(search_state.test_pos.pos
3122 - search_state.test_pos.string)
3123 + search_state.test_pos.offset);
3124 struct rx_counter_frame * cf
3125 = ((struct rx_counter_frame *)
3126 search_state.counter_stack->sp);
3127 if (cf->val == urhere)
3128 goto test_do_return;
3129 cf->val = urhere;
3130 break;
3131 }
3132 break;
3133
3134 case re_se_poppos:
3135 POP(search_state.counter_stack,
3136 sizeof (struct rx_counter_frame));
3137 break;
3138
3139
3140 case re_se_at_dot:
3141 case re_se_syntax:
3142 case re_se_not_syntax:
3143#ifdef emacs
3144 /*
3145 * this release lacks emacs support
3146 */
3147#endif
3148 break;
3149 case re_se_win:
3150 case re_se_lparen:
3151 case re_se_rparen:
3152 case re_se_backref:
3153 case re_se_iter:
3154 case re_se_end_iter:
3155 case re_se_tv:
3156 case re_floogle_flap:
3157 search_state.ret_val = 0;
3158 goto test_do_return;
3159 }
3160 }
3161 else
3162 {
3163#ifdef RX_DEBUG_0
3164 if (rx_debug_trace)
3165 fprintf (stderr, "....%d %d>> %s %d %d\n", search_state.line_no,
3166 search_state.backtrack_depth,
3167 efnames2[rxb->se_params [effect].se],
3168 rxb->se_params [effect].op1,
3169 rxb->se_params [effect].op2);
3170#endif
3171 switch (rxb->se_params [effect].se)
3172 {
3173 case re_se_win:
3174 /* This side effect indicates that we've
3175 * found a match, though not necessarily the
3176 * best match. This is a fancy assignment to
3177 * register 0 unless the caller didn't
3178 * care about registers. In which case,
3179 * this stops the match.
3180 */
3181 {
3182 int urhere =
3183 ((int)(search_state.test_pos.pos
3184 - search_state.test_pos.string)
3185 + search_state.test_pos.offset);
3186
3187 if ( (search_state.best_last_r < 0)
3188 || (urhere + 1 > search_state.best_rparen[0]))
3189 {
3190 /* Record the best known and keep
3191 * looking.
3192 */
3193 int x;
3194 for (x = 0; x <= search_state.last_l; ++x)
3195 search_state.best_lparen[x] = search_state.lparen[x];
3196 search_state.best_last_l = search_state.last_l;
3197 for (x = 0; x <= search_state.last_r; ++x)
3198 search_state.best_rparen[x] = search_state.rparen[x];
3199 search_state.best_rparen[0] = urhere + 1;
3200 search_state.best_last_r = search_state.last_r;
3201 }
3202 /* If we're not reporting the match-length
3203 * or other register info, we need look no
3204 * further.
3205 */
3206 if (search_state.first_found)
3207 {
3208 search_state.test_ret = rx_test_found_first;
3209 goto test_do_return;
3210 }
3211 }
3212 break;
3213 case re_se_lparen:
3214 {
3215 int urhere =
3216 ((int)(search_state.test_pos.pos
3217 - search_state.test_pos.string)
3218 + search_state.test_pos.offset);
3219
3220 int reg = rxb->se_params [effect].op1;
3221#if 0
3222 if (reg > search_state.last_l)
3223#endif
3224 {
3225 search_state.lparen[reg] = urhere + 1;
3226 /* In addition to making this assignment,
3227 * we now know that lower numbered regs
3228 * that haven't already been assigned,
3229 * won't be. We make sure they're
3230 * filled with -1, so they can be
3231 * recognized as unassigned.
3232 */
3233 if (search_state.last_l < reg)
3234 while (++search_state.last_l < reg)
3235 search_state.lparen[search_state.last_l] = -1;
3236 }
3237 break;
3238 }
3239
3240 case re_se_rparen:
3241 {
3242 int urhere =
3243 ((int)(search_state.test_pos.pos
3244 - search_state.test_pos.string)
3245 + search_state.test_pos.offset);
3246 int reg = rxb->se_params [effect].op1;
3247 search_state.rparen[reg] = urhere + 1;
3248 if (search_state.last_r < reg)
3249 {
3250 while (++search_state.last_r < reg)
3251 search_state.rparen[search_state.last_r]
3252 = -1;
3253 }
3254 break;
3255 }
3256
3257 case re_se_backref:
3258 {
3259 int reg = rxb->se_params [effect].op1;
3260 if ( reg > search_state.last_r
3261 || search_state.rparen[reg] < 0)
3262 goto test_do_return;
3263
3264 {
3265 int backref_status;
3266 check_backreference:
3267 backref_status
3268 = back_check (&search_state.test_pos,
3269 search_state.lparen[reg],
3270 search_state.rparen[reg],
3271 search_state.translate,
3272 app_closure,
3273 stop);
3274 switch (backref_status)
3275 {
3276 case rx_back_check_continuation:
3277 search_state.saved_reg = reg;
3278 test_pc = rx_test_backreference_check;
3279 goto test_return_continuation;
3280 resume_continuation_2:
3281 reg = search_state.saved_reg;
3282 goto check_backreference;
3283 case rx_back_check_fail:
3284 /* Fail */
3285 goto test_do_return;
3286 case rx_back_check_pass:
3287 /* pass --
3288 * test_pos now advanced to last
3289 * char matched by backref
3290 */
3291 break;
3292 }
3293 }
3294 break;
3295 }
3296 case re_se_iter:
3297 {
3298 struct rx_counter_frame * csp
3299 = ((struct rx_counter_frame *)
3300 search_state.counter_stack->sp);
3301 if (csp->val == rxb->se_params[effect].op2)
3302 goto test_do_return;
3303 else
3304 ++csp->val;
3305 break;
3306 }
3307 case re_se_end_iter:
3308 {
3309 struct rx_counter_frame * csp
3310 = ((struct rx_counter_frame *)
3311 search_state.counter_stack->sp);
3312 if (csp->val < rxb->se_params[effect].op1)
3313 goto test_do_return;
3314 else
3315 {
3316 struct rx_counter_frame * source = csp;
3317 while (source->inherited_from)
3318 source = source->inherited_from;
3319 if (!source || !source->cdr)
3320 {
3321 POP(search_state.counter_stack,
3322 sizeof(struct rx_counter_frame));
3323 }
3324 else
3325 {
3326 source = source->cdr;
3327 csp->val = source->val;
3328 csp->tag = source->tag;
3329 csp->cdr = 0;
3330 csp->inherited_from = source;
3331 }
3332 }
3333 break;
3334 }
3335 case re_se_tv:
3336 /* is a noop */
3337 break;
3338 case re_se_try:
3339 case re_se_pushback:
3340 case re_se_push0:
3341 case re_se_pushpos:
3342 case re_se_chkpos:
3343 case re_se_poppos:
3344 case re_se_at_dot:
3345 case re_se_syntax:
3346 case re_se_not_syntax:
3347 case re_se_begbuf:
3348 case re_se_hat:
3349 case re_se_wordbeg:
3350 case re_se_wordbound:
3351 case re_se_notwordbound:
3352 case re_se_wordend:
3353 case re_se_endbuf:
3354 case re_se_dollar:
3355 case re_se_fail:
3356 case re_floogle_flap:
3357 search_state.ret_val = 0;
3358 goto test_do_return;
3359 }
3360 }
3361 el = el->cdr;
3362 }
3363 /* Now the side effects are done,
3364 * so get the next instruction.
3365 * and move on.
3366 */
3367 search_state.ifr = &df->future_frame;
3368 goto restart;
3369 }
3370
3371 case rx_backtrack_point:
3372 {
3373 /* A backtrack point indicates that we've reached a
3374 * non-determinism in the superstate NFA. This is a
3375 * loop that exhaustively searches the possibilities.
3376 *
3377 * A backtracking strategy is used. We keep track of what
3378 * registers are valid so we can erase side effects.
3379 *
3380 * First, make sure there is some stack space to hold
3381 * our state.
3382 */
3383
3384 struct rx_backtrack_frame * bf;
3385
3386 PUSH(search_state.backtrack_stack,
3387 search_state.backtrack_frame_bytes);
3388#ifdef RX_DEBUG_0
3389 ++search_state.backtrack_depth;
3390#endif
3391
3392 bf = ((struct rx_backtrack_frame *)
3393 search_state.backtrack_stack->sp);
3394 {
3395 bf->stk_super = search_state.super;
3396 /* We prevent the current superstate from being
3397 * deleted from the superstate cache.
3398 */
3399 rx_lock_superstate (&rxb->rx, search_state.super);
3400#ifdef RX_DEBUG_0
3401 bf->stk_search_state.line_no = search_state.line_no;
3402#endif
3403 bf->stk_c = search_state.c;
3404 bf->stk_test_pos = search_state.test_pos;
3405 bf->stk_last_l = search_state.last_l;
3406 bf->stk_last_r = search_state.last_r;
3407 bf->df = ((struct rx_super_edge *)
3408 search_state.ifr->data_2)->options;
3409 bf->first_df = bf->df;
3410 bf->counter_stack_sp = (search_state.counter_stack
3411 ? search_state.counter_stack->sp
3412 : 0);
3413 bf->stk_test_ret = search_state.test_ret;
3414 if (rxb->match_regs_on_stack)
3415 {
3416 int x;
3417 regoff_t * stk =
3418 (regoff_t *)((char *)bf + sizeof (*bf));
3419 for (x = 0; x <= search_state.last_l; ++x)
3420 stk[x] = search_state.lparen[x];
3421 stk += x;
3422 for (x = 0; x <= search_state.last_r; ++x)
3423 stk[x] = search_state.rparen[x];
3424 }
3425 }
3426
3427 /* Here is a while loop whose body is mainly a function
3428 * call and some code to handle a return from that
3429 * function.
3430 *
3431 * From here on for the rest of `case backtrack_point' it
3432 * is unsafe to assume that the search_state copies of
3433 * variables saved on the backtracking stack are valid
3434 * -- so read their values from the backtracking stack.
3435 *
3436 * This lets us use one generation fewer stack saves in
3437 * the call-graph of a search.
3438 */
3439
3440 while_non_det_options:
3441#ifdef RX_DEBUG_0
3442 ++search_state.lines_found;
3443 if (rx_debug_trace)
3444 fprintf (stderr, "@@@ %d calls %d @@@\n",
3445 search_state.line_no, search_state.lines_found);
3446
3447 search_state.line_no = search_state.lines_found;
3448#endif
3449
3450 if (bf->df->next_same_super_edge[0] == bf->first_df)
3451 {
3452 /* This is a tail-call optimization -- we don't recurse
3453 * for the last of the possible futures.
3454 */
3455 search_state.ifr = (bf->df->effects
3456 ? &bf->df->side_effects_frame
3457 : &bf->df->future_frame);
3458
3459 rx_unlock_superstate (&rxb->rx, search_state.super);
3460 POP(search_state.backtrack_stack,
3461 search_state.backtrack_frame_bytes);
3462#ifdef RX_DEBUG
3463 --search_state.backtrack_depth;
3464#endif
3465 goto restart;
3466 }
3467 else
3468 {
3469 if (search_state.counter_stack)
3470 {
3471 struct rx_counter_frame * old_cf
3472 = ((struct rx_counter_frame *)search_state.counter_stack->sp);
3473 struct rx_counter_frame * cf;
3474 PUSH(search_state.counter_stack, sizeof (struct rx_counter_frame));
3475 cf = ((struct rx_counter_frame *)search_state.counter_stack->sp);
3476 cf->tag = old_cf->tag;
3477 cf->val = old_cf->val;
3478 cf->inherited_from = old_cf;
3479 cf->cdr = 0;
3480 }
3481 /* `Call' this test-match block */
3482 search_state.ifr = (bf->df->effects
3483 ? &bf->df->side_effects_frame
3484 : &bf->df->future_frame);
3485 goto recurse_test_match;
3486 }
3487
3488 /* Returns in this block are accomplished by
3489 * goto test_do_return. There are two cases.
3490 * If there is some search-stack left,
3491 * then it is a return from a `recursive' call.
3492 * If there is no search-stack left, then
3493 * we should return to the fastmap/search loop.
3494 */
3495
3496 test_do_return:
3497
3498 if (!search_state.backtrack_stack)
3499 {
3500#ifdef RX_DEBUG_0
3501 if (rx_debug_trace)
3502 fprintf (stderr, "!!! %d bails returning %d !!!\n",
3503 search_state.line_no, search_state.test_ret);
3504#endif
3505
3506 /* No more search-stack -- this test is done. */
3507 if (search_state.test_ret)
3508 goto return_from_test_match;
3509 else
3510 goto error_in_testing_match;
3511 }
3512
3513 /* Returning from a recursive call to
3514 * the test match block:
3515 */
3516
3517 bf = ((struct rx_backtrack_frame *)
3518 search_state.backtrack_stack->sp);
3519#ifdef RX_DEBUG_0
3520 if (rx_debug_trace)
3521 fprintf (stderr, "+++ %d returns %d (to %d)+++\n",
3522 search_state.line_no,
3523 search_state.test_ret,
3524 bf->stk_search_state.line_no);
3525#endif
3526
3527 while (search_state.counter_stack
3528 && (!bf->counter_stack_sp
3529 || (bf->counter_stack_sp
3530 != search_state.counter_stack->sp)))
3531 {
3532 POP(search_state.counter_stack,
3533 sizeof (struct rx_counter_frame));
3534 }
3535
3536 if (search_state.test_ret == rx_test_error)
3537 {
3538 POP (search_state.backtrack_stack,
3539 search_state.backtrack_frame_bytes);
3540 goto test_do_return;
3541 }
3542
3543 /* If a non-longest match was found and that is good
3544 * enough, return immediately.
3545 */
3546 if ( (search_state.test_ret == rx_test_found_first)
3547 && search_state.first_found)
3548 {
3549 rx_unlock_superstate (&rxb->rx, bf->stk_super);
3550 POP (search_state.backtrack_stack,
3551 search_state.backtrack_frame_bytes);
3552 goto test_do_return;
3553 }
3554
3555 search_state.test_ret = bf->stk_test_ret;
3556 search_state.last_l = bf->stk_last_l;
3557 search_state.last_r = bf->stk_last_r;
3558 bf->df = bf->df->next_same_super_edge[0];
3559 search_state.super = bf->stk_super;
3560 search_state.c = bf->stk_c;
3561#ifdef RX_DEBUG_0
3562 search_state.line_no = bf->stk_search_state.line_no;
3563#endif
3564
3565 if (rxb->match_regs_on_stack)
3566 {
3567 int x;
3568 regoff_t * stk =
3569 (regoff_t *)((char *)bf + sizeof (*bf));
3570 for (x = 0; x <= search_state.last_l; ++x)
3571 search_state.lparen[x] = stk[x];
3572 stk += x;
3573 for (x = 0; x <= search_state.last_r; ++x)
3574 search_state.rparen[x] = stk[x];
3575 }
3576
3577 {
3578 int x;
3579 try_burst_2:
3580 x = get_burst (&bf->stk_test_pos, app_closure, stop);
3581 switch (x)
3582 {
3583 case rx_get_burst_continuation:
3584 search_state.saved_bf = bf;
3585 test_pc = rx_test_backtrack_return;
3586 goto test_return_continuation;
3587 resume_continuation_3:
3588 bf = search_state.saved_bf;
3589 goto try_burst_2;
3590 case rx_get_burst_no_more:
3591 /* Since we've been here before, it is some kind of
3592 * error that we can't return.
3593 */
3594 case rx_get_burst_error:
3595 search_state.test_ret = rx_test_internal_error;
3596 goto test_do_return;
3597 case rx_get_burst_ok:
3598 break;
3599 }
3600 }
3601 search_state.test_pos = bf->stk_test_pos;
3602 goto while_non_det_options;
3603 }
3604
3605
3606 case rx_cache_miss:
3607 /* Because the superstate NFA is lazily constructed,
3608 * and in fact may erode from underneath us, we sometimes
3609 * have to construct the next instruction from the hard way.
3610 * This invokes one step in the lazy-conversion.
3611 */
3612 search_state.ifr = rx_handle_cache_miss (&rxb->rx,
3613 search_state.super,
3614 search_state.c,
3615 search_state.ifr->data_2);
3616 if (!search_state.ifr)
3617 {
3618 search_state.test_ret = rx_test_internal_error;
3619 goto test_do_return;
3620 }
3621 goto restart;
3622
3623 case rx_backtrack:
3624 /* RX_BACKTRACK means that we've reached the empty
3625 * superstate, indicating that match can't succeed
3626 * from this point.
3627 */
3628 goto test_do_return;
3629
3630 case rx_next_char:
3631 case rx_error_inx:
3632 case rx_num_instructions:
3633 search_state.ret_val = 0;
3634 goto test_do_return;
3635 }
3636 goto pseudo_while_1;
3637 }
3638
3639 /* Healthy exits from the test-match loop do a
3640 * `goto return_from_test_match' On the other hand,
3641 * we might end up here.
3642 */
3643 error_in_testing_match:
3644 test_state = rx_test_error;
3645 goto test_returns_to_search;
3646
3647 /***** fastmap/search loop body
3648 * considering the results testing for a match
3649 */
3650
3651 return_from_test_match:
3652
3653 if (search_state.best_last_l >= 0)
3654 {
3655 if (regs && (regs->start != search_state.best_lparen))
3656 {
3657 bcopy (search_state.best_lparen, regs->start,
3658 regs->num_regs * sizeof (int));
3659 bcopy (search_state.best_rparen, regs->end,
3660 regs->num_regs * sizeof (int));
3661 }
3662 if (regs && !rxb->no_sub)
3663 {
3664 int q;
3665 int bound = (regs->num_regs > search_state.num_regs
3666 ? regs->num_regs
3667 : search_state.num_regs);
3668 regoff_t * s = regs->start;
3669 regoff_t * e = regs->end;
3670 for (q = search_state.best_last_l + 1; q < bound; ++q)
3671 s[q] = e[q] = -1;
3672 }
3673 search_state.ret_val = search_state.best_lparen[0];
3674 test_state = rx_test_ok;
3675 goto test_returns_to_search;
3676 }
3677 else
3678 {
3679 test_state = rx_test_fail;
3680 goto test_returns_to_search;
3681 }
3682
3683 test_return_continuation:
3684 search_state.test_match_resume_pt = test_pc;
3685 test_state = rx_test_continuation;
3686 goto test_returns_to_search;
3687 }
3688}
3689
3690
3691
3692#endif /* RX_WANT_RX_DEFS */
3693
3694
3695
3696#else /* RX_WANT_SE_DEFS */
3697 /* Integers are used to represent side effects.
3698 *
3699 * Simple side effects are given negative integer names by these enums.
3700 *
3701 * Non-negative names are reserved for complex effects.
3702 *
3703 * Complex effects are those that take arguments. For example,
3704 * a register assignment associated with a group is complex because
3705 * it requires an argument to tell which group is being matched.
3706 *
3707 * The integer name of a complex effect is an index into rxb->se_params.
3708 */
3709
3710 RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */
3711
3712 RX_DEF_SE(0, re_se_pushback, = re_se_try - 1)
3713 RX_DEF_SE(0, re_se_push0, = re_se_pushback -1)
3714 RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1)
3715 RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1)
3716 RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1)
3717
3718 RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */
3719 RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */
3720 RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */
3721
3722 RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */
3723 RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */
3724
3725 RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1)
3726 RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1)
3727 RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1)
3728
3729 RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1)
3730 RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1)
3731
3732 /* This fails except at the end of a line.
3733 * It deserves to go here since it is typicly one of the last steps
3734 * in a match.
3735 */
3736 RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1)
3737
3738 /* Simple effects: */
3739 RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1)
3740
3741 /* Complex effects. These are used in the 'se' field of
3742 * a struct re_se_params. Indexes into the se array
3743 * are stored as instructions on nfa edges.
3744 */
3745 RX_DEF_CPLX_SE(1, re_se_win, = 0)
3746 RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1)
3747 RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1)
3748 RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1)
3749 RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1)
3750 RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1)
3751 RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1)
3752
3753#endif
3754
3755#endif
Note: See TracBrowser for help on using the repository browser.