[3745] | 1 | #if !defined(RXH) || defined(RX_WANT_SE_DEFS)
|
---|
| 2 | #define RXH
|
---|
| 3 |
|
---|
| 4 | /* Copyright (C) 1992, 1993 Free Software Foundation, Inc.
|
---|
| 5 |
|
---|
| 6 | This file is part of the librx library.
|
---|
| 7 |
|
---|
| 8 | Librx is free software; you can redistribute it and/or modify it under
|
---|
| 9 | the terms of the GNU Library General Public License as published by
|
---|
| 10 | the Free Software Foundation; either version 2, or (at your option)
|
---|
| 11 | any later version.
|
---|
| 12 |
|
---|
| 13 | Librx is distributed in the hope that it will be useful, but WITHOUT
|
---|
| 14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
---|
| 15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
---|
| 16 | for more details.
|
---|
| 17 |
|
---|
| 18 | You should have received a copy of the GNU Library General Public
|
---|
| 19 | License along with this software; see the file COPYING.LIB. If not,
|
---|
| 20 | write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
|
---|
| 21 | 02139, USA. */
|
---|
| 22 | /* t. lord Wed Sep 23 18:20:57 1992 */
|
---|
| 23 |
|
---|
[23508] | 24 | #include "mglong.h"
|
---|
[3745] | 25 |
|
---|
| 26 | #ifndef RX_WANT_SE_DEFS
|
---|
| 27 |
|
---|
| 28 | /* This page: Bitsets */
|
---|
| 29 |
|
---|
| 30 | #ifndef RX_subset
|
---|
| 31 | typedef unsigned int RX_subset;
|
---|
| 32 | #define RX_subset_bits (32)
|
---|
| 33 | #define RX_subset_mask (RX_subset_bits - 1)
|
---|
| 34 | #endif
|
---|
| 35 |
|
---|
| 36 | typedef RX_subset * rx_Bitset;
|
---|
| 37 |
|
---|
| 38 | #ifdef __STDC__
|
---|
| 39 | typedef void (*rx_bitset_iterator) (void *, int member_index);
|
---|
| 40 | #else
|
---|
| 41 | typedef void (*rx_bitset_iterator) ();
|
---|
| 42 | #endif
|
---|
| 43 |
|
---|
| 44 | #define rx_bitset_subset(N) ((N) / RX_subset_bits)
|
---|
| 45 | #define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)])
|
---|
| 46 | #define RX_bitset_access(B,N,OP) \
|
---|
| 47 | ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
|
---|
| 48 | #define RX_bitset_member(B,N) RX_bitset_access(B, N, &)
|
---|
| 49 | #define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=)
|
---|
| 50 | #define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~)
|
---|
| 51 | #define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= )
|
---|
| 52 | #define rx_bitset_numb_subsets(N) (((N) + RX_subset_bits - 1) / RX_subset_bits)
|
---|
| 53 | #define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset))
|
---|
| 54 |
|
---|
| 55 | |
---|
| 56 |
|
---|
| 57 |
|
---|
| 58 | /* This page: Splay trees. */
|
---|
| 59 |
|
---|
| 60 | #ifdef __STDC__
|
---|
| 61 | typedef int (*rx_sp_comparer) (void * a, void * b);
|
---|
| 62 | #else
|
---|
| 63 | typedef int (*rx_sp_comparer) ();
|
---|
| 64 | #endif
|
---|
| 65 |
|
---|
| 66 | struct rx_sp_node
|
---|
| 67 | {
|
---|
| 68 | void * key;
|
---|
| 69 | void * data;
|
---|
| 70 | struct rx_sp_node * kids[2];
|
---|
| 71 | };
|
---|
| 72 |
|
---|
| 73 | #ifdef __STDC__
|
---|
| 74 | typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
|
---|
| 75 | #else
|
---|
| 76 | typedef void (*rx_sp_key_data_freer) ();
|
---|
| 77 | #endif
|
---|
| 78 |
|
---|
| 79 | |
---|
| 80 |
|
---|
| 81 | /* giant inflatable hash trees */
|
---|
| 82 |
|
---|
| 83 | struct rx_hash_item
|
---|
| 84 | {
|
---|
[23508] | 85 | struct rx_hash_item * next_same_hash;
|
---|
[3745] | 86 | struct rx_hash * table;
|
---|
| 87 | mg_u_long hash;
|
---|
| 88 | void * data;
|
---|
| 89 | void * binding;
|
---|
| 90 | };
|
---|
| 91 |
|
---|
| 92 | struct rx_hash
|
---|
| 93 | {
|
---|
| 94 | struct rx_hash * parent;
|
---|
| 95 | int refs;
|
---|
| 96 | struct rx_hash * children[13];
|
---|
| 97 | struct rx_hash_item * buckets [13];
|
---|
| 98 | int bucket_size [13];
|
---|
| 99 | };
|
---|
| 100 |
|
---|
| 101 | struct rx_hash_rules;
|
---|
| 102 |
|
---|
| 103 | #ifdef __STDC__
|
---|
| 104 | /* should return like == */
|
---|
| 105 | typedef int (*rx_hash_eq)(void *, void *);
|
---|
| 106 | typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
|
---|
| 107 | typedef void (*rx_free_hash)(struct rx_hash *,
|
---|
| 108 | struct rx_hash_rules *);
|
---|
| 109 | typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
|
---|
| 110 | void *);
|
---|
| 111 | typedef void (*rx_free_hash_item)(struct rx_hash_item *,
|
---|
| 112 | struct rx_hash_rules *);
|
---|
| 113 | #else
|
---|
| 114 | typedef int (*rx_hash_eq)();
|
---|
| 115 | typedef struct rx_hash * (*rx_alloc_hash)();
|
---|
| 116 | typedef void (*rx_free_hash)();
|
---|
| 117 | typedef struct rx_hash_item * (*rx_alloc_hash_item)();
|
---|
| 118 | typedef void (*rx_free_hash_item)();
|
---|
| 119 | #endif
|
---|
| 120 |
|
---|
| 121 | struct rx_hash_rules
|
---|
| 122 | {
|
---|
| 123 | rx_hash_eq eq;
|
---|
| 124 | rx_alloc_hash hash_alloc;
|
---|
| 125 | rx_free_hash free_hash;
|
---|
| 126 | rx_alloc_hash_item hash_item_alloc;
|
---|
| 127 | rx_free_hash_item free_hash_item;
|
---|
| 128 | };
|
---|
| 129 |
|
---|
| 130 | |
---|
| 131 |
|
---|
| 132 | /* Forward declarations */
|
---|
| 133 |
|
---|
| 134 | struct rx_cache;
|
---|
| 135 | struct rx_superset;
|
---|
| 136 | struct rx;
|
---|
| 137 | struct rx_se_list;
|
---|
| 138 |
|
---|
| 139 | |
---|
| 140 |
|
---|
| 141 |
|
---|
| 142 | /*
|
---|
| 143 | * GLOSSARY
|
---|
| 144 | *
|
---|
| 145 | * regexp
|
---|
| 146 | * regular expression
|
---|
| 147 | * expression
|
---|
| 148 | * pattern - a `regular' expression. The expression
|
---|
| 149 | * need not be formally regular -- it can contain
|
---|
| 150 | * constructs that don't correspond to purely regular
|
---|
| 151 | * expressions.
|
---|
| 152 | *
|
---|
| 153 | * buffer
|
---|
| 154 | * string - the string (or strings) being searched or matched.
|
---|
| 155 | *
|
---|
| 156 | * pattern buffer - a structure of type `struct re_pattern_buffer'
|
---|
| 157 | * This in turn contains a `struct rx', which holds the
|
---|
| 158 | * NFA compiled from a pattern, as well as some of the state
|
---|
| 159 | * of a matcher using the pattern.
|
---|
| 160 | *
|
---|
| 161 | * NFA - nondeterministic finite automata. Some people
|
---|
| 162 | * use this term to a member of the class of
|
---|
| 163 | * regular automata (those corresponding to a regular
|
---|
| 164 | * language). However, in this code, the meaning is
|
---|
| 165 | * more general. The automata used by Rx are comperable
|
---|
| 166 | * in power to what are usually called `push down automata'.
|
---|
| 167 | *
|
---|
| 168 | * Two NFA are built by rx for every pattern. One is built
|
---|
| 169 | * by the compiler. The other is built from the first, on
|
---|
| 170 | * the fly, by the matcher. The latter is called the `superstate
|
---|
| 171 | * NFA' because its states correspond to sets of states from
|
---|
| 172 | * the first NFA. (Joe Keane gets credit for the name
|
---|
| 173 | * `superstate NFA').
|
---|
| 174 | *
|
---|
| 175 | * NFA edges
|
---|
| 176 | * epsilon edges
|
---|
| 177 | * side-effect edges - The NFA compiled from a pattern can have three
|
---|
| 178 | * kinds of edges. Epsilon edges can be taken freely anytime
|
---|
| 179 | * their source state is reached. Character set edges can be
|
---|
| 180 | * taken when their source state is reached and when the next
|
---|
| 181 | * character in the buffer is a member of the set. Side effect
|
---|
| 182 | * edges imply a transition that can only be taken after the
|
---|
| 183 | * indicated side effect has been successfully accomplished.
|
---|
| 184 | * Some examples of side effects are:
|
---|
| 185 | *
|
---|
| 186 | * Storing the current match position to record the
|
---|
| 187 | * location of a parentesized subexpression.
|
---|
| 188 | *
|
---|
| 189 | * Advancing the matcher over N characters if they
|
---|
| 190 | * match the N characters previously matched by a
|
---|
| 191 | * parentesized subexpression.
|
---|
| 192 | *
|
---|
| 193 | * Both of those kinds of edges occur in the NFA generated
|
---|
| 194 | * by the pattern: \(.\)\1
|
---|
| 195 | *
|
---|
| 196 | * Epsilon and side effect edges are similar. Unfortunately,
|
---|
| 197 | * some of the code uses the name `epsilon edge' to mean
|
---|
| 198 | * both epsilon and side effect edges. For example, the
|
---|
| 199 | * function has_non_idempotent_epsilon_path computes the existance
|
---|
| 200 | * of a non-trivial path containing only a mix of epsilon and
|
---|
| 201 | * side effect edges. In that case `nonidempotent epsilon' is being
|
---|
| 202 | * used to mean `side effect'.
|
---|
| 203 | */
|
---|
| 204 |
|
---|
| 205 |
|
---|
| 206 |
|
---|
| 207 | |
---|
| 208 |
|
---|
| 209 |
|
---|
| 210 | /* LOW LEVEL PATTERN BUFFERS */
|
---|
| 211 |
|
---|
| 212 | /* Suppose that from some NFA state, more than one path through
|
---|
| 213 | * side-effect edges is possible. In what order should the paths
|
---|
| 214 | * be tried? A function of type rx_se_list_order answers that
|
---|
| 215 | * question. It compares two lists of side effects, and says
|
---|
| 216 | * which list comes first.
|
---|
| 217 | */
|
---|
| 218 |
|
---|
| 219 | #ifdef __STDC__
|
---|
| 220 | typedef int (*rx_se_list_order) (struct rx *,
|
---|
| 221 | struct rx_se_list *,
|
---|
| 222 | struct rx_se_list *);
|
---|
| 223 | #else
|
---|
| 224 | typedef int (*rx_se_list_order) ();
|
---|
| 225 | #endif
|
---|
| 226 |
|
---|
| 227 |
|
---|
| 228 |
|
---|
| 229 | /* Struct RX holds a compiled regular expression - that is, an nfa
|
---|
| 230 | * ready to be converted on demand to a more efficient superstate nfa.
|
---|
| 231 | * This is for the low level interface. The high-level interfaces enclose
|
---|
| 232 | * this in a `struct re_pattern_buffer'.
|
---|
| 233 | */
|
---|
| 234 | struct rx
|
---|
| 235 | {
|
---|
| 236 | /* The compiler assigns a unique id to every pattern.
|
---|
| 237 | * Like sequence numbers in X, there is a subtle bug here
|
---|
| 238 | * if you use Rx in a system that runs for a long time.
|
---|
| 239 | * But, because of the way the caches work out, it is almost
|
---|
| 240 | * impossible to trigger the Rx version of this bug.
|
---|
| 241 | *
|
---|
| 242 | * The id is used to validate superstates found in a cache
|
---|
| 243 | * of superstates. It isn't sufficient to let a superstate
|
---|
| 244 | * point back to the rx for which it was compiled -- the caller
|
---|
| 245 | * may be re-using a `struct rx' in which case the superstate
|
---|
| 246 | * is not really valid. So instead, superstates are validated
|
---|
| 247 | * by checking the sequence number of the pattern for which
|
---|
| 248 | * they were built.
|
---|
| 249 | */
|
---|
| 250 | int rx_id;
|
---|
| 251 |
|
---|
| 252 | /* This is memory mgt. state for superstates. This may be
|
---|
| 253 | * shared by more than one struct rx.
|
---|
| 254 | */
|
---|
| 255 | struct rx_cache * cache;
|
---|
| 256 |
|
---|
| 257 | /* Every regex defines the size of its own character set.
|
---|
| 258 | * A superstate has an array of this size, with each element
|
---|
| 259 | * a `struct rx_inx'. So, don't make this number too large.
|
---|
| 260 | * In particular, don't make it 2^16.
|
---|
| 261 | */
|
---|
| 262 | int local_cset_size;
|
---|
| 263 |
|
---|
[23508] | 264 | /* After the NFA is built, it is copied into a contiguous region
|
---|
[3745] | 265 | * of memory (mostly for compatability with GNU regex).
|
---|
| 266 | * Here is that region, and it's size:
|
---|
| 267 | */
|
---|
| 268 | void * buffer;
|
---|
| 269 | mg_u_long allocated;
|
---|
| 270 |
|
---|
[23508] | 271 | /* Clients of RX can ask for some extra storage in the space pointed
|
---|
[3745] | 272 | * to by BUFFER. The field RESERVED is an input parameter to the
|
---|
| 273 | * compiler. After compilation, this much space will be available
|
---|
| 274 | * at (buffer + allocated - reserved)
|
---|
| 275 | */
|
---|
| 276 | mg_u_long reserved;
|
---|
| 277 |
|
---|
| 278 | /* --------- The remaining fields are for internal use only. --------- */
|
---|
| 279 | /* --------- But! they must be initialized to 0. --------- */
|
---|
| 280 |
|
---|
| 281 | /* NODEC is the number of nodes in the NFA with non-epsilon
|
---|
| 282 | * transitions.
|
---|
| 283 | */
|
---|
| 284 | int nodec;
|
---|
| 285 |
|
---|
| 286 | /* EPSNODEC is the number of nodes with only epsilon transitions. */
|
---|
| 287 | int epsnodec;
|
---|
| 288 |
|
---|
| 289 | /* The sum (NODEC + EPSNODEC) is the total number of states in the
|
---|
| 290 | * compiled NFA.
|
---|
| 291 | */
|
---|
| 292 |
|
---|
| 293 | /* Lists of side effects as stored in the NFA are `hash consed'..meaning
|
---|
| 294 | * that lists with the same elements are ==. During compilation,
|
---|
| 295 | * this table facilitates hash-consing.
|
---|
| 296 | */
|
---|
| 297 | struct rx_hash se_list_memo;
|
---|
| 298 |
|
---|
| 299 | /* Lists of NFA states are also hashed.
|
---|
| 300 | */
|
---|
| 301 | struct rx_hash set_list_memo;
|
---|
| 302 |
|
---|
| 303 |
|
---|
| 304 |
|
---|
| 305 |
|
---|
| 306 | /* The compiler and matcher must build a number of instruction frames.
|
---|
| 307 | * The format of these frames is fixed (c.f. struct rx_inx). The values
|
---|
| 308 | * of the instructions is not fixed.
|
---|
| 309 | *
|
---|
| 310 | * An enumerated type (enum rx_opcode) defines the set of instructions
|
---|
| 311 | * that the compiler or matcher might generate. When filling an instruction
|
---|
| 312 | * frame, the INX field is found by indexing this instruction table
|
---|
| 313 | * with an opcode:
|
---|
| 314 | */
|
---|
| 315 | void ** instruction_table;
|
---|
| 316 |
|
---|
| 317 | /* The list of all states in an NFA.
|
---|
| 318 | * During compilation, the NEXT field of NFA states links this list.
|
---|
| 319 | * After compilation, all the states are compacted into an array,
|
---|
| 320 | * ordered by state id numbers. At that time, this points to the base
|
---|
| 321 | * of that array.
|
---|
| 322 | */
|
---|
| 323 | struct rx_nfa_state *nfa_states;
|
---|
| 324 |
|
---|
| 325 | /* Every nfa begins with one distinguished starting state:
|
---|
| 326 | */
|
---|
| 327 | struct rx_nfa_state *start;
|
---|
| 328 |
|
---|
| 329 | /* This orders the search through super-nfa paths.
|
---|
| 330 | * See the comment near the typedef of rx_se_list_order.
|
---|
| 331 | */
|
---|
| 332 | rx_se_list_order se_list_cmp;
|
---|
| 333 |
|
---|
| 334 | struct rx_superset * start_set;
|
---|
| 335 | };
|
---|
| 336 |
|
---|
| 337 |
|
---|
| 338 |
|
---|
| 339 | |
---|
| 340 |
|
---|
| 341 | /* SYNTAX TREES */
|
---|
| 342 |
|
---|
| 343 | /* Compilation is in stages.
|
---|
| 344 | *
|
---|
| 345 | * In the first stage, a pattern specified by a string is
|
---|
| 346 | * translated into a syntax tree. Later stages will convert
|
---|
| 347 | * the syntax tree into an NFA optimized for conversion to a
|
---|
| 348 | * superstate-NFA.
|
---|
| 349 | *
|
---|
| 350 | * This page is about syntax trees.
|
---|
| 351 | */
|
---|
| 352 |
|
---|
| 353 | enum rexp_node_type
|
---|
| 354 | {
|
---|
| 355 | r_cset, /* Match from a character set. `a' or `[a-z]'*/
|
---|
| 356 | r_concat, /* Concat two subexpressions. `ab' */
|
---|
| 357 | r_alternate, /* Choose one of two subexpressions. `a\|b' */
|
---|
| 358 | r_opt, /* Optional subexpression. `a?' */
|
---|
| 359 | r_star, /* Repeated subexpression. `a*' */
|
---|
| 360 |
|
---|
| 361 |
|
---|
| 362 | /* A 2phase-star is a variation on a repeated subexpression.
|
---|
| 363 | * In this case, there are two subexpressions. The first, if matched,
|
---|
| 364 | * begins a repitition (otherwise, the whole expression is matches the
|
---|
| 365 | * empth string).
|
---|
| 366 | *
|
---|
| 367 | * After matching the first subexpression, a 2phase star either finishes,
|
---|
| 368 | * or matches the second subexpression. If the second subexpression is
|
---|
| 369 | * matched, then the whole construct repeats.
|
---|
| 370 | *
|
---|
| 371 | * 2phase stars are used in two circumstances. First, they
|
---|
| 372 | * are used as part of the implementation of POSIX intervals (counted
|
---|
| 373 | * repititions). Second, they are used to implement proper star
|
---|
| 374 | * semantics when the repeated subexpression contains paths of
|
---|
| 375 | * only side effects. See rx_compile for more information.
|
---|
| 376 | */
|
---|
| 377 | r_2phase_star,
|
---|
| 378 |
|
---|
| 379 |
|
---|
| 380 | /* c.f. "typedef void * rx_side_effect" */
|
---|
| 381 | r_side_effect,
|
---|
| 382 |
|
---|
| 383 | /* This is an extension type: It is for transient use in source->source
|
---|
| 384 | * transformations (implemented over syntax trees).
|
---|
| 385 | */
|
---|
| 386 | r_data
|
---|
| 387 | };
|
---|
| 388 |
|
---|
| 389 | /* A side effect is a matcher-specific action associated with
|
---|
| 390 | * transitions in the NFA. The details of side effects are up
|
---|
| 391 | * to the matcher. To the compiler and superstate constructors
|
---|
| 392 | * side effects are opaque:
|
---|
| 393 | */
|
---|
| 394 |
|
---|
| 395 | typedef void * rx_side_effect;
|
---|
| 396 |
|
---|
| 397 | /* Nodes in a syntax tree are of this type:
|
---|
| 398 | */
|
---|
| 399 | struct rexp_node
|
---|
| 400 | {
|
---|
| 401 | enum rexp_node_type type;
|
---|
| 402 | union
|
---|
| 403 | {
|
---|
| 404 | rx_Bitset cset;
|
---|
| 405 | rx_side_effect side_effect;
|
---|
| 406 | struct
|
---|
| 407 | {
|
---|
| 408 | struct rexp_node *left;
|
---|
| 409 | struct rexp_node *right;
|
---|
| 410 | } pair;
|
---|
| 411 | void * data;
|
---|
| 412 | } params;
|
---|
| 413 | };
|
---|
| 414 |
|
---|
| 415 |
|
---|
| 416 | |
---|
| 417 |
|
---|
| 418 | /* NFA
|
---|
| 419 | *
|
---|
| 420 | * A syntax tree is compiled into an NFA. This page defines the structure
|
---|
| 421 | * of that NFA.
|
---|
| 422 | */
|
---|
| 423 |
|
---|
| 424 | struct rx_nfa_state
|
---|
| 425 | {
|
---|
| 426 | /* These are kept in a list as the NFA is being built. */
|
---|
| 427 | struct rx_nfa_state *next;
|
---|
| 428 |
|
---|
| 429 | /* After the NFA is built, states are given integer id's.
|
---|
| 430 | * States whose outgoing transitions are all either epsilon or
|
---|
| 431 | * side effect edges are given ids less than 0. Other states
|
---|
| 432 | * are given successive non-negative ids starting from 0.
|
---|
| 433 | */
|
---|
| 434 | int id;
|
---|
| 435 |
|
---|
| 436 | /* The list of NFA edges that go from this state to some other. */
|
---|
| 437 | struct rx_nfa_edge *edges;
|
---|
| 438 |
|
---|
| 439 | /* If you land in this state, then you implicitly land
|
---|
| 440 | * in all other states reachable by only epsilon translations.
|
---|
| 441 | * Call the set of maximal paths to such states the epsilon closure
|
---|
| 442 | * of this state.
|
---|
| 443 | *
|
---|
| 444 | * There may be other states that are reachable by a mixture of
|
---|
| 445 | * epsilon and side effect edges. Consider the set of maximal paths
|
---|
| 446 | * of that sort from this state. Call it the epsilon-side-effect
|
---|
| 447 | * closure of the state.
|
---|
| 448 | *
|
---|
| 449 | * The epsilon closure of the state is a subset of the epsilon-side-
|
---|
| 450 | * effect closure. It consists of all the paths that contain
|
---|
| 451 | * no side effects -- only epsilon edges.
|
---|
| 452 | *
|
---|
| 453 | * The paths in the epsilon-side-effect closure can be partitioned
|
---|
| 454 | * into equivalance sets. Two paths are equivalant if they have the
|
---|
| 455 | * same set of side effects, in the same order. The epsilon-closure
|
---|
| 456 | * is one of these equivalance sets. Let's call these equivalance
|
---|
| 457 | * sets: observably equivalant path sets. That name is chosen
|
---|
| 458 | * because equivalance of two paths means they cause the same side
|
---|
| 459 | * effects -- so they lead to the same subsequent observations other
|
---|
| 460 | * than that they may wind up in different target states.
|
---|
| 461 | *
|
---|
| 462 | * The superstate nfa, which is derived from this nfa, is based on
|
---|
| 463 | * the observation that all of the paths in an observably equivalant
|
---|
| 464 | * path set can be explored at the same time, provided that the
|
---|
| 465 | * matcher keeps track not of a single nfa state, but of a set of
|
---|
| 466 | * states. In particular, after following all the paths in an
|
---|
| 467 | * observably equivalant set, you wind up at a set of target states.
|
---|
| 468 | * That set of target states corresponds to one state in the
|
---|
| 469 | * superstate NFA.
|
---|
| 470 | *
|
---|
| 471 | * Staticly, before matching begins, it is convenient to analyze the
|
---|
| 472 | * nfa. Each state is labeled with a list of the observably
|
---|
| 473 | * equivalant path sets who's union covers all the
|
---|
| 474 | * epsilon-side-effect paths beginning in this state. This list is
|
---|
| 475 | * called the possible futures of the state.
|
---|
| 476 | *
|
---|
| 477 | * A trivial example is this NFA:
|
---|
| 478 | * s1
|
---|
| 479 | * A ---> B
|
---|
| 480 | *
|
---|
| 481 | * s2
|
---|
| 482 | * ---> C
|
---|
| 483 | *
|
---|
| 484 | * epsilon s1
|
---|
| 485 | * ---------> D ------> E
|
---|
| 486 | *
|
---|
| 487 | *
|
---|
| 488 | * In this example, A has two possible futures.
|
---|
| 489 | * One invokes the side effect `s1' and contains two paths,
|
---|
| 490 | * one ending in state B, the other in state E.
|
---|
| 491 | * The other invokes the side effect `s2' and contains only
|
---|
| 492 | * one path, landing in state C.
|
---|
| 493 | */
|
---|
| 494 | struct rx_possible_future *futures;
|
---|
| 495 |
|
---|
| 496 |
|
---|
| 497 | /* There are exactly two distinguished states in every NFA: */
|
---|
| 498 | unsigned int is_final:1;
|
---|
| 499 | unsigned int is_start:1;
|
---|
| 500 |
|
---|
| 501 | /* These are used during NFA construction... */
|
---|
| 502 | unsigned int eclosure_needed:1;
|
---|
| 503 | unsigned int mark:1;
|
---|
| 504 | };
|
---|
| 505 |
|
---|
| 506 |
|
---|
| 507 | /* An edge in an NFA is typed: */
|
---|
| 508 | enum rx_nfa_etype
|
---|
| 509 | {
|
---|
| 510 | /* A cset edge is labled with a set of characters one of which
|
---|
| 511 | * must be matched for the edge to be taken.
|
---|
| 512 | */
|
---|
| 513 | ne_cset,
|
---|
| 514 |
|
---|
| 515 | /* An epsilon edge is taken whenever its starting state is
|
---|
| 516 | * reached.
|
---|
| 517 | */
|
---|
| 518 | ne_epsilon,
|
---|
| 519 |
|
---|
| 520 | /* A side effect edge is taken whenever its starting state is
|
---|
| 521 | * reached. Side effects may cause the match to fail or the
|
---|
| 522 | * position of the matcher to advance.
|
---|
| 523 | */
|
---|
| 524 | ne_side_effect /* A special kind of epsilon. */
|
---|
| 525 | };
|
---|
| 526 |
|
---|
| 527 | struct rx_nfa_edge
|
---|
| 528 | {
|
---|
| 529 | struct rx_nfa_edge *next;
|
---|
| 530 | enum rx_nfa_etype type;
|
---|
| 531 | struct rx_nfa_state *dest;
|
---|
| 532 | union
|
---|
| 533 | {
|
---|
| 534 | rx_Bitset cset;
|
---|
| 535 | rx_side_effect side_effect;
|
---|
| 536 | } params;
|
---|
| 537 | };
|
---|
| 538 |
|
---|
| 539 |
|
---|
| 540 |
|
---|
| 541 | /* A possible future consists of a list of side effects
|
---|
| 542 | * and a set of destination states. Below are their
|
---|
| 543 | * representations. These structures are hash-consed which
|
---|
| 544 | * means that lists with the same elements share a representation
|
---|
| 545 | * (their addresses are ==).
|
---|
| 546 | */
|
---|
| 547 |
|
---|
| 548 | struct rx_nfa_state_set
|
---|
| 549 | {
|
---|
| 550 | struct rx_nfa_state * car;
|
---|
| 551 | struct rx_nfa_state_set * cdr;
|
---|
| 552 | };
|
---|
| 553 |
|
---|
| 554 | struct rx_se_list
|
---|
| 555 | {
|
---|
| 556 | rx_side_effect car;
|
---|
| 557 | struct rx_se_list * cdr;
|
---|
| 558 | };
|
---|
| 559 |
|
---|
| 560 | struct rx_possible_future
|
---|
| 561 | {
|
---|
| 562 | struct rx_possible_future *next;
|
---|
| 563 | struct rx_se_list * effects;
|
---|
| 564 | struct rx_nfa_state_set * destset;
|
---|
| 565 | };
|
---|
| 566 |
|
---|
| 567 | |
---|
| 568 |
|
---|
| 569 |
|
---|
| 570 | /* This begins the description of the superstate NFA.
|
---|
| 571 | *
|
---|
| 572 | * The superstate NFA corresponds to the NFA in these ways:
|
---|
| 573 | *
|
---|
| 574 | * Every superstate NFA states SUPER correspond to sets of NFA states,
|
---|
| 575 | * nfa_states(SUPER).
|
---|
| 576 | *
|
---|
| 577 | * Superstate edges correspond to NFA paths.
|
---|
| 578 | *
|
---|
| 579 | * The superstate has no epsilon transitions;
|
---|
| 580 | * every edge has a character label, and a (possibly empty) side
|
---|
| 581 | * effect label. The side effect label corresponds to a list of
|
---|
| 582 | * side effects that occur in the NFA. These parts are referred
|
---|
| 583 | * to as: superedge_character(EDGE) and superedge_sides(EDGE).
|
---|
| 584 | *
|
---|
| 585 | * For a superstate edge EDGE starting in some superstate SUPER,
|
---|
| 586 | * the following is true (in pseudo-notation :-):
|
---|
| 587 | *
|
---|
| 588 | * exists DEST in nfa_states s.t.
|
---|
| 589 | * exists nfaEDGE in nfa_edges s.t.
|
---|
| 590 | * origin (nfaEDGE) == DEST
|
---|
| 591 | * && origin (nfaEDGE) is a member of nfa_states(SUPER)
|
---|
| 592 | * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
|
---|
| 593 | * sides_of_possible_future (PF) == superedge_sides (EDGE)
|
---|
| 594 | *
|
---|
| 595 | * also:
|
---|
| 596 | *
|
---|
| 597 | * let SUPER2 := superedge_destination(EDGE)
|
---|
| 598 | * nfa_states(SUPER2)
|
---|
| 599 | * == union of all nfa state sets S s.t.
|
---|
| 600 | * exists PF in possible_futures(dest(nfaEDGE)) s.t.
|
---|
| 601 | * sides_of_possible_future (PF) == superedge_sides (EDGE)
|
---|
| 602 | * && S == dests_of_possible_future (PF) }
|
---|
| 603 | *
|
---|
| 604 | * Or in english, every superstate is a set of nfa states. A given
|
---|
| 605 | * character and a superstate implies many transitions in the NFA --
|
---|
| 606 | * those that begin with an edge labeled with that character from a
|
---|
| 607 | * state in the set corresponding to the superstate.
|
---|
| 608 | *
|
---|
| 609 | * The destinations of those transitions each have a set of possible
|
---|
| 610 | * futures. A possible future is a list of side effects and a set of
|
---|
| 611 | * destination NFA states. Two sets of possible futures can be
|
---|
| 612 | * `merged' by combining all pairs of possible futures that have the
|
---|
| 613 | * same side effects. A pair is combined by creating a new future
|
---|
| 614 | * with the same side effect but the union of the two destination sets.
|
---|
| 615 | * In this way, all the possible futures suggested by a superstate
|
---|
| 616 | * and a character can be merged into a set of possible futures where
|
---|
| 617 | * no two elements of the set have the same set of side effects.
|
---|
| 618 | *
|
---|
| 619 | * The destination of a possible future, being a set of NFA states,
|
---|
| 620 | * corresponds to a supernfa state. So, the merged set of possible
|
---|
| 621 | * futures we just created can serve as a set of edges in the
|
---|
| 622 | * supernfa.
|
---|
| 623 | *
|
---|
| 624 | * The representation of the superstate nfa and the nfa is critical.
|
---|
| 625 | * The nfa has to be compact, but has to facilitate the rapid
|
---|
| 626 | * computation of missing superstates. The superstate nfa has to
|
---|
| 627 | * be fast to interpret, lazilly constructed, and bounded in space.
|
---|
| 628 | *
|
---|
| 629 | * To facilitate interpretation, the superstate data structures are
|
---|
| 630 | * peppered with `instruction frames'. There is an instruction set
|
---|
| 631 | * defined below which matchers using the supernfa must be able to
|
---|
| 632 | * interpret.
|
---|
| 633 | *
|
---|
| 634 | * We'd like to make it possible but not mandatory to use code
|
---|
| 635 | * addresses to represent instructions (c.f. gcc's computed goto).
|
---|
| 636 | * Therefore, we define an enumerated type of opcodes, and when
|
---|
| 637 | * writing one of these instructions into a data structure, use
|
---|
| 638 | * the opcode as an index into a table of instruction values.
|
---|
| 639 | *
|
---|
| 640 | * Here are the opcodes that occur in the superstate nfa:
|
---|
| 641 | */
|
---|
| 642 |
|
---|
| 643 |
|
---|
| 644 | /* Every superstate contains a table of instruction frames indexed
|
---|
| 645 | * by characters. A normal `move' in a matcher is to fetch the next
|
---|
| 646 | * character and use it as an index into a superstates transition
|
---|
| 647 | * table.
|
---|
| 648 | *
|
---|
| 649 | * In the fasted case, only one edge follows from that character.
|
---|
| 650 | * In other cases there is more work to do.
|
---|
| 651 | *
|
---|
| 652 | * The descriptions of the opcodes refer to data structures that are
|
---|
| 653 | * described further below.
|
---|
| 654 | */
|
---|
| 655 |
|
---|
| 656 | enum rx_opcode
|
---|
| 657 | {
|
---|
| 658 | /*
|
---|
| 659 | * BACKTRACK_POINT is invoked when a character transition in
|
---|
| 660 | * a superstate leads to more than one edge. In that case,
|
---|
| 661 | * the edges have to be explored independently using a backtracking
|
---|
| 662 | * strategy.
|
---|
| 663 | *
|
---|
| 664 | * A BACKTRACK_POINT instruction is stored in a superstate's
|
---|
| 665 | * transition table for some character when it is known that that
|
---|
| 666 | * character crosses more than one edge. On encountering this
|
---|
| 667 | * instruction, the matcher saves enough state to backtrack to this
|
---|
| 668 | * point in the match later.
|
---|
| 669 | */
|
---|
| 670 | rx_backtrack_point = 0, /* data is (struct transition_class *) */
|
---|
| 671 |
|
---|
| 672 | /*
|
---|
| 673 | * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
|
---|
| 674 | * There is one occurence of this instruction per rx_distinct_future.
|
---|
| 675 | * This instruction is skipped if a rx_distinct_future has no side effects.
|
---|
| 676 | */
|
---|
| 677 | rx_do_side_effects = rx_backtrack_point + 1,
|
---|
| 678 |
|
---|
| 679 | /* data is (struct rx_distinct_future *) */
|
---|
| 680 |
|
---|
| 681 | /*
|
---|
| 682 | * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
|
---|
| 683 | * destination superstate has been reclaimed (or was never built).
|
---|
| 684 | * It recomputes the destination superstate.
|
---|
| 685 | * RX_CACHE_MISS is also stored in a superstate transition table before
|
---|
| 686 | * any of its edges have been built.
|
---|
| 687 | */
|
---|
| 688 | rx_cache_miss = rx_do_side_effects + 1,
|
---|
| 689 | /* data is (struct rx_distinct_future *) */
|
---|
| 690 |
|
---|
| 691 | /*
|
---|
| 692 | * RX_NEXT_CHAR is called to consume the next character and take the
|
---|
| 693 | * corresponding transition. This is the only instruction that uses
|
---|
| 694 | * the DATA field of the instruction frame instead of DATA_2.
|
---|
| 695 | * (see EXPLORE_FUTURE in regex.c).
|
---|
| 696 | */
|
---|
| 697 | rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
|
---|
| 698 |
|
---|
| 699 | /* RX_BACKTRACK indicates that a transition fails.
|
---|
| 700 | */
|
---|
| 701 | rx_backtrack = rx_next_char + 1, /* no data */
|
---|
| 702 |
|
---|
| 703 | /*
|
---|
| 704 | * RX_ERROR_INX is stored only in places that should never be executed.
|
---|
| 705 | */
|
---|
| 706 | rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
|
---|
| 707 |
|
---|
| 708 | rx_num_instructions = rx_error_inx + 1
|
---|
| 709 | };
|
---|
| 710 |
|
---|
| 711 | /* An id_instruction_table holds the values stored in instruction
|
---|
| 712 | * frames. The table is indexed by the enums declared above.
|
---|
| 713 | */
|
---|
| 714 | extern void * rx_id_instruction_table[rx_num_instructions];
|
---|
| 715 |
|
---|
| 716 | /* The heart of the matcher is a `word-code-interpreter'
|
---|
| 717 | * (like a byte-code interpreter, except that instructions
|
---|
| 718 | * are a full word wide).
|
---|
| 719 | *
|
---|
| 720 | * Instructions are not stored in a vector of code, instead,
|
---|
| 721 | * they are scattered throughout the data structures built
|
---|
| 722 | * by the regexp compiler and the matcher. One word-code instruction,
|
---|
| 723 | * together with the arguments to that instruction, constitute
|
---|
| 724 | * an instruction frame (struct rx_inx).
|
---|
| 725 | *
|
---|
| 726 | * This structure type is padded by hand to a power of 2 because
|
---|
| 727 | * in one of the dominant cases, we dispatch by indexing a table
|
---|
| 728 | * of instruction frames. If that indexing can be accomplished
|
---|
| 729 | * by just a shift of the index, we're happy.
|
---|
| 730 | *
|
---|
| 731 | * Instructions take at most one argument, but there are two
|
---|
| 732 | * slots in an instruction frame that might hold that argument.
|
---|
| 733 | * These are called data and data_2. The data slot is only
|
---|
| 734 | * used for one instruction (RX_NEXT_CHAR). For all other
|
---|
| 735 | * instructions, data should be set to 0.
|
---|
| 736 | *
|
---|
| 737 | * RX_NEXT_CHAR is the most important instruction by far.
|
---|
| 738 | * By reserving the data field for its exclusive use,
|
---|
| 739 | * instruction dispatch is sped up in that case. There is
|
---|
| 740 | * no need to fetch both the instruction and the data,
|
---|
| 741 | * only the data is needed. In other words, a `cycle' begins
|
---|
| 742 | * by fetching the field data. If that is non-0, then it must
|
---|
| 743 | * be the destination state of a next_char transition, so
|
---|
| 744 | * make that value the current state, advance the match position
|
---|
| 745 | * by one character, and start a new cycle. On the other hand,
|
---|
| 746 | * if data is 0, fetch the instruction and do a more complicated
|
---|
| 747 | * dispatch on that.
|
---|
| 748 | */
|
---|
| 749 |
|
---|
| 750 | struct rx_inx
|
---|
| 751 | {
|
---|
| 752 | void * data;
|
---|
| 753 | void * data_2;
|
---|
| 754 | void * inx;
|
---|
| 755 | void * fnord;
|
---|
| 756 | };
|
---|
| 757 |
|
---|
| 758 | #ifndef RX_TAIL_ARRAY
|
---|
| 759 | #define RX_TAIL_ARRAY 1
|
---|
| 760 | #endif
|
---|
| 761 |
|
---|
| 762 | /* A superstate corresponds to a set of nfa states. Those sets are
|
---|
| 763 | * represented by STRUCT RX_SUPERSET. The constructors
|
---|
| 764 | * guarantee that only one (shared) structure is created for a given set.
|
---|
| 765 | */
|
---|
| 766 | struct rx_superset
|
---|
| 767 | {
|
---|
| 768 | int refs; /* This is a reference counted structure. */
|
---|
| 769 |
|
---|
| 770 | /* We keep these sets in a cache because (in an unpredictable way),
|
---|
| 771 | * the same set is often created again and again. But that is also
|
---|
| 772 | * problematic -- compatibility with POSIX and GNU regex requires
|
---|
| 773 | * that we not be able to tell when a program discards a particular
|
---|
| 774 | * NFA (thus invalidating the supersets created from it).
|
---|
| 775 | *
|
---|
| 776 | * But when a cache hit appears to occur, we will have in hand the
|
---|
| 777 | * nfa for which it may have happened. That is why every nfa is given
|
---|
| 778 | * its own sequence number. On a cache hit, the cache is validated
|
---|
| 779 | * by comparing the nfa sequence number to this field:
|
---|
| 780 | */
|
---|
| 781 | int id;
|
---|
| 782 |
|
---|
| 783 | struct rx_nfa_state * car; /* May or may not be a valid addr. */
|
---|
| 784 | struct rx_superset * cdr;
|
---|
| 785 |
|
---|
| 786 | /* If the corresponding superstate exists: */
|
---|
| 787 | struct rx_superstate * superstate;
|
---|
| 788 |
|
---|
| 789 |
|
---|
| 790 | /* There is another bookkeeping problem. It is expensive to
|
---|
| 791 | * compute the starting nfa state set for an nfa. So, once computed,
|
---|
| 792 | * it is cached in the `struct rx'.
|
---|
| 793 | *
|
---|
| 794 | * But, the state set can be flushed from the superstate cache.
|
---|
| 795 | * When that happens, we can't know if the corresponding `struct rx'
|
---|
| 796 | * is still alive or if it has been freed or re-used by the program.
|
---|
| 797 | * So, the cached pointer to this set in a struct rx might be invalid
|
---|
| 798 | * and we need a way to validate it.
|
---|
| 799 | *
|
---|
| 800 | * Fortunately, even if this set is flushed from the cache, it is
|
---|
| 801 | * not freed. It just goes on the free-list of supersets.
|
---|
| 802 | * So we can still examine it.
|
---|
| 803 | *
|
---|
| 804 | * So to validate a starting set memo, check to see if the
|
---|
| 805 | * starts_for field still points back to the struct rx in question,
|
---|
| 806 | * and if the ID matches the rx sequence number.
|
---|
| 807 | */
|
---|
| 808 | struct rx * starts_for;
|
---|
| 809 |
|
---|
| 810 | /* This is used to link into a hash bucket so these objects can
|
---|
| 811 | * be `hash-consed'.
|
---|
| 812 | */
|
---|
| 813 | struct rx_hash_item hash_item;
|
---|
| 814 | };
|
---|
| 815 |
|
---|
| 816 | #define rx_protect_superset(RX,CON) (++(CON)->refs)
|
---|
| 817 |
|
---|
| 818 | /* The terminology may be confusing (rename this structure?).
|
---|
| 819 | * Every character occurs in at most one rx_super_edge per super-state.
|
---|
| 820 | * But, that structure might have more than one option, indicating a point
|
---|
| 821 | * of non-determinism.
|
---|
| 822 | *
|
---|
| 823 | * In other words, this structure holds a list of superstate edges
|
---|
| 824 | * sharing a common starting state and character label. The edges
|
---|
| 825 | * are in the field OPTIONS. All superstate edges sharing the same
|
---|
| 826 | * starting state and character are in this list.
|
---|
| 827 | */
|
---|
| 828 | struct rx_super_edge
|
---|
| 829 | {
|
---|
| 830 | struct rx_super_edge *next;
|
---|
| 831 | struct rx_inx rx_backtrack_frame;
|
---|
| 832 | int cset_size;
|
---|
| 833 | rx_Bitset cset;
|
---|
| 834 | struct rx_distinct_future *options;
|
---|
| 835 | };
|
---|
| 836 |
|
---|
| 837 | /* A superstate is a set of nfa states (RX_SUPERSET) along
|
---|
| 838 | * with a transition table. Superstates are built on demand and reclaimed
|
---|
| 839 | * without warning. To protect a superstate from this ghastly fate,
|
---|
| 840 | * use LOCK_SUPERSTATE.
|
---|
| 841 | */
|
---|
| 842 | struct rx_superstate
|
---|
| 843 | {
|
---|
| 844 | int rx_id; /* c.f. the id field of rx_superset */
|
---|
| 845 | int locks; /* protection from reclamation */
|
---|
| 846 |
|
---|
| 847 | /* Within a superstate cache, all the superstates are kept in a big
|
---|
| 848 | * queue. The tail of the queue is the state most likely to be
|
---|
| 849 | * reclaimed. The *recyclable fields hold the queue position of
|
---|
| 850 | * this state.
|
---|
| 851 | */
|
---|
| 852 | struct rx_superstate * next_recyclable;
|
---|
| 853 | struct rx_superstate * prev_recyclable;
|
---|
| 854 |
|
---|
| 855 | /* The supernfa edges that exist in the cache and that have
|
---|
| 856 | * this state as their destination are kept in this list:
|
---|
| 857 | */
|
---|
| 858 | struct rx_distinct_future * transition_refs;
|
---|
| 859 |
|
---|
| 860 | /* The list of nfa states corresponding to this superstate: */
|
---|
| 861 | struct rx_superset * contents;
|
---|
| 862 |
|
---|
| 863 | /* The list of edges in the cache beginning from this state. */
|
---|
| 864 | struct rx_super_edge * edges;
|
---|
| 865 |
|
---|
| 866 | /* A tail of the recyclable queue is marked as semifree. A semifree
|
---|
| 867 | * state has no incoming next_char transitions -- any transition
|
---|
| 868 | * into a semifree state causes a complex dispatch with the side
|
---|
| 869 | * effect of rescuing the state from its semifree state.
|
---|
| 870 | *
|
---|
| 871 | * An alternative to this might be to make next_char more expensive,
|
---|
| 872 | * and to move a state to the head of the recyclable queue whenever
|
---|
| 873 | * it is entered. That way, popular states would never be recycled.
|
---|
| 874 | *
|
---|
| 875 | * But unilaterally making next_char more expensive actually loses.
|
---|
| 876 | * So, incoming transitions are only made expensive for states near
|
---|
| 877 | * the tail of the recyclable queue. The more cache contention
|
---|
| 878 | * there is, the more frequently a state will have to prove itself
|
---|
| 879 | * and be moved back to the front of the queue. If there is less
|
---|
| 880 | * contention, then popular states just aggregate in the front of
|
---|
| 881 | * the queue and stay there.
|
---|
| 882 | */
|
---|
| 883 | int is_semifree;
|
---|
| 884 |
|
---|
| 885 |
|
---|
| 886 | /* This keeps track of the size of the transition table for this
|
---|
| 887 | * state. There is a half-hearted attempt to support variable sized
|
---|
| 888 | * superstates.
|
---|
| 889 | */
|
---|
| 890 | int trans_size;
|
---|
| 891 |
|
---|
| 892 | /* Indexed by characters... */
|
---|
| 893 | struct rx_inx transitions[RX_TAIL_ARRAY];
|
---|
| 894 | };
|
---|
| 895 |
|
---|
| 896 |
|
---|
| 897 | /* A list of distinct futures define the edges that leave from a
|
---|
| 898 | * given superstate on a given character. c.f. rx_super_edge.
|
---|
| 899 | */
|
---|
| 900 |
|
---|
| 901 | struct rx_distinct_future
|
---|
| 902 | {
|
---|
| 903 | struct rx_distinct_future * next_same_super_edge[2];
|
---|
| 904 | struct rx_distinct_future * next_same_dest;
|
---|
| 905 | struct rx_distinct_future * prev_same_dest;
|
---|
| 906 | struct rx_superstate * present; /* source state */
|
---|
| 907 | struct rx_superstate * future; /* destination state */
|
---|
| 908 | struct rx_super_edge * edge;
|
---|
| 909 |
|
---|
| 910 |
|
---|
| 911 | /* The future_frame holds the instruction that should be executed
|
---|
| 912 | * after all the side effects are done, when it is time to complete
|
---|
| 913 | * the transition to the next state.
|
---|
| 914 | *
|
---|
| 915 | * Normally this is a next_char instruction, but it may be a
|
---|
| 916 | * cache_miss instruction as well, depending on whether or not
|
---|
| 917 | * the superstate is in the cache and semifree.
|
---|
| 918 | *
|
---|
| 919 | * If this is the only future for a given superstate/char, and
|
---|
| 920 | * if there are no side effects to be performed, this frame is
|
---|
| 921 | * not used (directly) at all. Instead, its contents are copied
|
---|
| 922 | * into the transition table of the starting state of this dist. future.
|
---|
| 923 | */
|
---|
| 924 | struct rx_inx future_frame;
|
---|
| 925 |
|
---|
| 926 | struct rx_inx side_effects_frame;
|
---|
| 927 | struct rx_se_list * effects;
|
---|
| 928 | };
|
---|
| 929 |
|
---|
| 930 | #define rx_lock_superstate(R,S) ((S)->locks++)
|
---|
| 931 | #define rx_unlock_superstate(R,S) (--(S)->locks)
|
---|
| 932 |
|
---|
| 933 | |
---|
| 934 |
|
---|
| 935 | /* This page destined for rx.h */
|
---|
| 936 |
|
---|
| 937 | struct rx_blocklist
|
---|
| 938 | {
|
---|
| 939 | struct rx_blocklist * next;
|
---|
| 940 | int bytes;
|
---|
| 941 | };
|
---|
| 942 |
|
---|
| 943 | struct rx_freelist
|
---|
| 944 | {
|
---|
| 945 | struct rx_freelist * next;
|
---|
| 946 | };
|
---|
| 947 |
|
---|
| 948 | struct rx_cache;
|
---|
| 949 |
|
---|
| 950 | #ifdef __STDC__
|
---|
| 951 | typedef void (*rx_morecore_fn)(struct rx_cache *);
|
---|
| 952 | #else
|
---|
| 953 | typedef void (*rx_morecore_fn)();
|
---|
| 954 | #endif
|
---|
| 955 |
|
---|
| 956 | /* You use this to control the allocation of superstate data
|
---|
| 957 | * during matching. Most of it should be initialized to 0.
|
---|
| 958 | *
|
---|
| 959 | * A MORECORE function is necessary. It should allocate
|
---|
| 960 | * a new block of memory or return 0.
|
---|
| 961 | * A default that uses malloc is called `rx_morecore'.
|
---|
| 962 | *
|
---|
| 963 | * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
|
---|
| 964 | * the system will try to allocate. The default is 128. Batch style
|
---|
| 965 | * applications that are very regexp intensive should use as high a number
|
---|
| 966 | * as possible without thrashing.
|
---|
| 967 | *
|
---|
| 968 | * The LOCAL_CSET_SIZE is the number of characters in a character set.
|
---|
| 969 | * It is therefore the number of entries in a superstate transition table.
|
---|
| 970 | * Generally, it should be 256. If your character set has 16 bits,
|
---|
| 971 | * it is better to translate your regexps into equivalent 8 bit patterns.
|
---|
| 972 | */
|
---|
| 973 |
|
---|
| 974 | struct rx_cache
|
---|
| 975 | {
|
---|
| 976 | struct rx_hash_rules superset_hash_rules;
|
---|
| 977 |
|
---|
| 978 | /* Objects are allocated by incrementing a pointer that
|
---|
| 979 | * scans across rx_blocklists.
|
---|
| 980 | */
|
---|
| 981 | struct rx_blocklist * memory;
|
---|
| 982 | struct rx_blocklist * memory_pos;
|
---|
| 983 | int bytes_left;
|
---|
| 984 | char * memory_addr;
|
---|
| 985 | rx_morecore_fn morecore;
|
---|
| 986 |
|
---|
| 987 | /* Freelists. */
|
---|
| 988 | struct rx_freelist * free_superstates;
|
---|
| 989 | struct rx_freelist * free_transition_classes;
|
---|
| 990 | struct rx_freelist * free_discernable_futures;
|
---|
| 991 | struct rx_freelist * free_supersets;
|
---|
| 992 | struct rx_freelist * free_hash;
|
---|
| 993 |
|
---|
| 994 | /* Two sets of superstates -- those that are semifreed, and those
|
---|
| 995 | * that are being used.
|
---|
| 996 | */
|
---|
| 997 | struct rx_superstate * lru_superstate;
|
---|
| 998 | struct rx_superstate * semifree_superstate;
|
---|
| 999 |
|
---|
| 1000 | struct rx_superset * empty_superset;
|
---|
| 1001 |
|
---|
| 1002 | int superstates;
|
---|
| 1003 | int semifree_superstates;
|
---|
| 1004 | int hits;
|
---|
| 1005 | int misses;
|
---|
| 1006 | int superstates_allowed;
|
---|
| 1007 |
|
---|
| 1008 | int local_cset_size;
|
---|
| 1009 | void ** instruction_table;
|
---|
| 1010 |
|
---|
| 1011 | struct rx_hash superset_table;
|
---|
| 1012 | };
|
---|
| 1013 |
|
---|
| 1014 | |
---|
| 1015 |
|
---|
| 1016 |
|
---|
| 1017 | /* The lowest-level search function supports arbitrarily fragmented
|
---|
| 1018 | * strings and (optionally) suspendable/resumable searches.
|
---|
| 1019 | *
|
---|
| 1020 | * Callers have to provide a few hooks.
|
---|
| 1021 | */
|
---|
| 1022 |
|
---|
| 1023 | #ifndef __GNUC__
|
---|
| 1024 | #ifdef __STDC__
|
---|
| 1025 | #define __const__ const
|
---|
| 1026 | #else
|
---|
| 1027 | #define __const__
|
---|
| 1028 | #endif
|
---|
| 1029 | #endif
|
---|
| 1030 |
|
---|
| 1031 | /* This holds a matcher position */
|
---|
| 1032 | struct rx_string_position
|
---|
| 1033 | {
|
---|
| 1034 | __const__ unsigned char * pos; /* The current pos. */
|
---|
| 1035 | __const__ unsigned char * string; /* The current string burst. */
|
---|
| 1036 | __const__ unsigned char * end; /* First invalid position >= POS. */
|
---|
| 1037 | int offset; /* Integer address of the current burst. */
|
---|
| 1038 | int size; /* Current string's size. */
|
---|
| 1039 | int search_direction; /* 1 or -1 */
|
---|
| 1040 | int search_end; /* First position to not try. */
|
---|
| 1041 | };
|
---|
| 1042 |
|
---|
| 1043 |
|
---|
| 1044 | enum rx_get_burst_return
|
---|
| 1045 | {
|
---|
| 1046 | rx_get_burst_continuation,
|
---|
| 1047 | rx_get_burst_error,
|
---|
| 1048 | rx_get_burst_ok,
|
---|
| 1049 | rx_get_burst_no_more
|
---|
| 1050 | };
|
---|
| 1051 |
|
---|
| 1052 |
|
---|
| 1053 | /* A call to get burst should make POS valid. It might be invalid
|
---|
| 1054 | * if the STRING field doesn't point to a burst that actually
|
---|
| 1055 | * contains POS.
|
---|
| 1056 | *
|
---|
| 1057 | * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to
|
---|
| 1058 | * whether or not to pad to the left. Padding to the right is always
|
---|
| 1059 | * appropriate, but need not go past the point indicated by STOP.
|
---|
| 1060 | *
|
---|
| 1061 | * If a continuation is returned, then the reentering call to
|
---|
| 1062 | * a search function will retry the get_burst.
|
---|
| 1063 | */
|
---|
| 1064 |
|
---|
| 1065 | #ifdef __STDC__
|
---|
| 1066 | typedef enum rx_get_burst_return
|
---|
| 1067 | (*rx_get_burst_fn) (struct rx_string_position * pos,
|
---|
| 1068 | void * app_closure,
|
---|
| 1069 | int stop);
|
---|
| 1070 |
|
---|
| 1071 | #else
|
---|
| 1072 | typedef enum rx_get_burst_return (*rx_get_burst_fn) ();
|
---|
| 1073 | #endif
|
---|
| 1074 |
|
---|
| 1075 |
|
---|
| 1076 | enum rx_back_check_return
|
---|
| 1077 | {
|
---|
| 1078 | rx_back_check_continuation,
|
---|
| 1079 | rx_back_check_error,
|
---|
| 1080 | rx_back_check_pass,
|
---|
| 1081 | rx_back_check_fail
|
---|
| 1082 | };
|
---|
| 1083 |
|
---|
| 1084 | /* Back_check should advance the position it is passed
|
---|
| 1085 | * over rparen - lparen characters and return pass iff
|
---|
| 1086 | * the characters starting at POS match those indexed
|
---|
| 1087 | * by [LPAREN..RPAREN].
|
---|
| 1088 | *
|
---|
| 1089 | * If a continuation is returned, then the reentering call to
|
---|
| 1090 | * a search function will retry the back_check.
|
---|
| 1091 | */
|
---|
| 1092 |
|
---|
| 1093 | #ifdef __STDC__
|
---|
| 1094 | typedef enum rx_back_check_return
|
---|
| 1095 | (*rx_back_check_fn) (struct rx_string_position * pos,
|
---|
| 1096 | int lparen,
|
---|
| 1097 | int rparen,
|
---|
| 1098 | unsigned char * translate,
|
---|
| 1099 | void * app_closure,
|
---|
| 1100 | int stop);
|
---|
| 1101 |
|
---|
| 1102 | #else
|
---|
| 1103 | typedef enum rx_back_check_return (*rx_back_check_fn) ();
|
---|
| 1104 | #endif
|
---|
| 1105 |
|
---|
| 1106 |
|
---|
| 1107 |
|
---|
| 1108 |
|
---|
| 1109 | /* A call to fetch_char should return the character at POS or POS + 1.
|
---|
| 1110 | * Returning continuations here isn't supported. OFFSET is either 0 or 1
|
---|
| 1111 | * and indicates which characters is desired.
|
---|
| 1112 | */
|
---|
| 1113 |
|
---|
| 1114 | #ifdef __STDC__
|
---|
| 1115 | typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos,
|
---|
| 1116 | int offset,
|
---|
| 1117 | void * app_closure,
|
---|
| 1118 | int stop);
|
---|
| 1119 | #else
|
---|
| 1120 | typedef int (*rx_fetch_char_fn) ();
|
---|
| 1121 | #endif
|
---|
| 1122 |
|
---|
| 1123 |
|
---|
| 1124 | enum rx_search_return
|
---|
| 1125 | {
|
---|
| 1126 | rx_search_continuation = -4,
|
---|
| 1127 | rx_search_error = -3,
|
---|
| 1128 | rx_search_soft_fail = -2, /* failed by running out of string */
|
---|
| 1129 | rx_search_fail = -1 /* failed only by reaching failure states */
|
---|
| 1130 | /* return values >= 0 indicate the position of a successful match */
|
---|
| 1131 | };
|
---|
| 1132 |
|
---|
| 1133 |
|
---|
| 1134 |
|
---|
| 1135 |
|
---|
| 1136 | |
---|
| 1137 |
|
---|
| 1138 |
|
---|
| 1139 | /* regex.h
|
---|
| 1140 | *
|
---|
| 1141 | * The remaining declarations replace regex.h.
|
---|
| 1142 | */
|
---|
| 1143 |
|
---|
| 1144 | /* This is an array of error messages corresponding to the error codes.
|
---|
| 1145 | */
|
---|
| 1146 | extern __const__ char *re_error_msg[];
|
---|
| 1147 |
|
---|
| 1148 | /* If any error codes are removed, changed, or added, update the
|
---|
| 1149 | `re_error_msg' table in regex.c. */
|
---|
| 1150 | typedef enum
|
---|
| 1151 | {
|
---|
| 1152 | REG_NOERROR = 0, /* Success. */
|
---|
| 1153 | REG_NOMATCH, /* Didn't find a match (for regexec). */
|
---|
| 1154 |
|
---|
| 1155 | /* POSIX regcomp return error codes. (In the order listed in the
|
---|
| 1156 | standard.) */
|
---|
| 1157 | REG_BADPAT, /* Invalid pattern. */
|
---|
| 1158 | REG_ECOLLATE, /* Not implemented. */
|
---|
| 1159 | REG_ECTYPE, /* Invalid character class name. */
|
---|
| 1160 | REG_EESCAPE, /* Trailing backslash. */
|
---|
| 1161 | REG_ESUBREG, /* Invalid back reference. */
|
---|
| 1162 | REG_EBRACK, /* Unmatched left bracket. */
|
---|
| 1163 | REG_EPAREN, /* Parenthesis imbalance. */
|
---|
| 1164 | REG_EBRACE, /* Unmatched \{. */
|
---|
| 1165 | REG_BADBR, /* Invalid contents of \{\}. */
|
---|
| 1166 | REG_ERANGE, /* Invalid range end. */
|
---|
| 1167 | REG_ESPACE, /* Ran out of memory. */
|
---|
| 1168 | REG_BADRPT, /* No preceding re for repetition op. */
|
---|
| 1169 |
|
---|
| 1170 | /* Error codes we've added. */
|
---|
| 1171 | REG_EEND, /* Premature end. */
|
---|
| 1172 | REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
|
---|
| 1173 | REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */
|
---|
| 1174 | } reg_errcode_t;
|
---|
| 1175 |
|
---|
| 1176 | /* The regex.c support, as a client of rx, defines a set of possible
|
---|
| 1177 | * side effects that can be added to the edge lables of nfa edges.
|
---|
| 1178 | * Here is the list of sidef effects in use.
|
---|
| 1179 | */
|
---|
| 1180 |
|
---|
| 1181 | enum re_side_effects
|
---|
| 1182 | {
|
---|
| 1183 | #define RX_WANT_SE_DEFS 1
|
---|
| 1184 | #undef RX_DEF_SE
|
---|
| 1185 | #undef RX_DEF_CPLX_SE
|
---|
| 1186 | #define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE,
|
---|
| 1187 | #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE,
|
---|
| 1188 | #include "rx.h"
|
---|
| 1189 | #undef RX_DEF_SE
|
---|
| 1190 | #undef RX_DEF_CPLX_SE
|
---|
| 1191 | #undef RX_WANT_SE_DEFS
|
---|
| 1192 | re_floogle_flap = 65533
|
---|
| 1193 | };
|
---|
| 1194 |
|
---|
| 1195 | /* These hold paramaters for the kinds of side effects that are possible
|
---|
| 1196 | * in the supported pattern languages. These include things like the
|
---|
| 1197 | * numeric bounds of {} operators and the index of paren registers for
|
---|
| 1198 | * subexpression measurement or backreferencing.
|
---|
| 1199 | */
|
---|
| 1200 | struct re_se_params
|
---|
| 1201 | {
|
---|
| 1202 | enum re_side_effects se;
|
---|
| 1203 | int op1;
|
---|
| 1204 | int op2;
|
---|
| 1205 | };
|
---|
| 1206 |
|
---|
| 1207 | typedef unsigned reg_syntax_t;
|
---|
| 1208 |
|
---|
| 1209 | struct re_pattern_buffer
|
---|
| 1210 | {
|
---|
| 1211 | struct rx rx;
|
---|
| 1212 | reg_syntax_t syntax; /* See below for syntax bit definitions. */
|
---|
| 1213 |
|
---|
| 1214 | unsigned int no_sub:1; /* If set, don't return register offsets. */
|
---|
| 1215 | unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */
|
---|
| 1216 | unsigned int not_eol:1; /* match at the ends of the string. */
|
---|
| 1217 | unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/
|
---|
| 1218 | unsigned int least_subs:1; /* If set, and returning registers, return
|
---|
| 1219 | * as few values as possible. Only
|
---|
| 1220 | * backreferenced groups and group 0 (the whole
|
---|
| 1221 | * match) will be returned.
|
---|
| 1222 | */
|
---|
| 1223 |
|
---|
| 1224 | /* If true, this says that the matcher should keep registers on its
|
---|
| 1225 | * backtracking stack. For many patterns, we can easily determine that
|
---|
| 1226 | * this isn't necessary.
|
---|
| 1227 | */
|
---|
| 1228 | unsigned int match_regs_on_stack:1;
|
---|
| 1229 | unsigned int search_regs_on_stack:1;
|
---|
| 1230 |
|
---|
| 1231 | /* is_anchored and begbuf_only are filled in by rx_compile. */
|
---|
| 1232 | unsigned int is_anchored:1; /* Anchorded by ^? */
|
---|
| 1233 | unsigned int begbuf_only:1; /* Anchored to char position 0? */
|
---|
| 1234 |
|
---|
| 1235 |
|
---|
| 1236 | /* If REGS_UNALLOCATED, allocate space in the `regs' structure
|
---|
| 1237 | * for `max (RE_NREGS, re_nsub + 1)' groups.
|
---|
| 1238 | * If REGS_REALLOCATE, reallocate space if necessary.
|
---|
| 1239 | * If REGS_FIXED, use what's there.
|
---|
| 1240 | */
|
---|
| 1241 | #define REGS_UNALLOCATED 0
|
---|
| 1242 | #define REGS_REALLOCATE 1
|
---|
| 1243 | #define REGS_FIXED 2
|
---|
| 1244 | unsigned int regs_allocated:2;
|
---|
| 1245 |
|
---|
| 1246 |
|
---|
| 1247 | /* Either a translate table to apply to all characters before
|
---|
| 1248 | * comparing them, or zero for no translation. The translation
|
---|
| 1249 | * is applied to a pattern when it is compiled and to a string
|
---|
| 1250 | * when it is matched.
|
---|
| 1251 | */
|
---|
| 1252 | unsigned char * translate;
|
---|
| 1253 |
|
---|
| 1254 | /* If this is a valid pointer, it tells rx not to store the extents of
|
---|
| 1255 | * certain subexpressions (those corresponding to non-zero entries).
|
---|
| 1256 | * Passing 0x1 is the same as passing an array of all ones. Passing 0x0
|
---|
| 1257 | * is the same as passing an array of all zeros.
|
---|
| 1258 | * The array should contain as many entries as their are subexps in the
|
---|
[23508] | 1259 | * regexp.
|
---|
[3745] | 1260 | *
|
---|
| 1261 | * For POSIX compatability, when using regcomp and regexec this field
|
---|
| 1262 | * is zeroed and ignored.
|
---|
| 1263 | */
|
---|
| 1264 | char * syntax_parens;
|
---|
| 1265 |
|
---|
| 1266 | /* Number of subexpressions found by the compiler. */
|
---|
| 1267 | size_t re_nsub;
|
---|
| 1268 |
|
---|
| 1269 | void * buffer; /* Malloced memory for the nfa. */
|
---|
| 1270 | mg_u_long allocated; /* Size of that memory. */
|
---|
| 1271 |
|
---|
| 1272 | /* Pointer to a fastmap, if any, otherwise zero. re_search uses
|
---|
| 1273 | * the fastmap, if there is one, to skip over impossible
|
---|
| 1274 | * starting points for matches. */
|
---|
| 1275 | char *fastmap;
|
---|
| 1276 |
|
---|
| 1277 | unsigned int fastmap_accurate:1; /* These three are internal. */
|
---|
| 1278 | unsigned int can_match_empty:1;
|
---|
| 1279 | struct rx_nfa_state * start; /* The nfa starting state. */
|
---|
| 1280 |
|
---|
| 1281 | /* This is the list of iterator bounds for {lo,hi} constructs.
|
---|
| 1282 | * The memory pointed to is part of the rx->buffer.
|
---|
| 1283 | */
|
---|
| 1284 | struct re_se_params *se_params;
|
---|
| 1285 |
|
---|
| 1286 | /* This is a bitset representation of the fastmap.
|
---|
| 1287 | * This is a true fastmap that already takes the translate
|
---|
| 1288 | * table into account.
|
---|
| 1289 | */
|
---|
| 1290 | rx_Bitset fastset;
|
---|
| 1291 | };
|
---|
| 1292 |
|
---|
| 1293 | /* Type for byte offsets within the string. POSIX mandates this. */
|
---|
| 1294 | typedef int regoff_t;
|
---|
| 1295 |
|
---|
| 1296 | /* This is the structure we store register match data in. See
|
---|
| 1297 | regex.texinfo for a full description of what registers match. */
|
---|
| 1298 | struct re_registers
|
---|
| 1299 | {
|
---|
| 1300 | unsigned num_regs;
|
---|
| 1301 | regoff_t *start;
|
---|
| 1302 | regoff_t *end;
|
---|
| 1303 | };
|
---|
| 1304 |
|
---|
| 1305 | typedef struct re_pattern_buffer regex_t;
|
---|
| 1306 |
|
---|
| 1307 | /* POSIX specification for registers. Aside from the different names than
|
---|
| 1308 | `re_registers', POSIX uses an array of structures, instead of a
|
---|
| 1309 | structure of arrays. */
|
---|
| 1310 | typedef struct
|
---|
| 1311 | {
|
---|
| 1312 | regoff_t rm_so; /* Byte offset from string's start to substring's start. */
|
---|
| 1313 | regoff_t rm_eo; /* Byte offset from string's start to substring's end. */
|
---|
| 1314 | } regmatch_t;
|
---|
| 1315 |
|
---|
| 1316 | |
---|
| 1317 |
|
---|
| 1318 | /* The following bits are used to determine the regexp syntax we
|
---|
| 1319 | recognize. The set/not-set meanings are chosen so that Emacs syntax
|
---|
| 1320 | remains the value 0. The bits are given in alphabetical order, and
|
---|
| 1321 | the definitions shifted by one from the previous bit; thus, when we
|
---|
| 1322 | add or remove a bit, only one other definition need change. */
|
---|
| 1323 |
|
---|
| 1324 | /* If this bit is not set, then \ inside a bracket expression is literal.
|
---|
| 1325 | If set, then such a \ quotes the following character. */
|
---|
| 1326 | #define RE_BACKSLASH_ESCAPE_IN_LISTS (1)
|
---|
| 1327 |
|
---|
| 1328 | /* If this bit is not set, then + and ? are operators, and \+ and \? are
|
---|
| 1329 | literals.
|
---|
| 1330 | If set, then \+ and \? are operators and + and ? are literals. */
|
---|
| 1331 | #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
|
---|
| 1332 |
|
---|
| 1333 | /* If this bit is set, then character classes are supported. They are:
|
---|
| 1334 | [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
|
---|
| 1335 | [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
|
---|
| 1336 | If not set, then character classes are not supported. */
|
---|
| 1337 | #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
|
---|
| 1338 |
|
---|
| 1339 | /* If this bit is set, then ^ and $ are always anchors (outside bracket
|
---|
| 1340 | expressions, of course).
|
---|
| 1341 | If this bit is not set, then it depends:
|
---|
| 1342 | ^ is an anchor if it is at the beginning of a regular
|
---|
| 1343 | expression or after an open-group or an alternation operator;
|
---|
| 1344 | $ is an anchor if it is at the end of a regular expression, or
|
---|
| 1345 | before a close-group or an alternation operator.
|
---|
| 1346 |
|
---|
| 1347 | This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
|
---|
| 1348 | POSIX draft 11.2 says that * etc. in leading positions is undefined.
|
---|
| 1349 | We already implemented a previous draft which made those constructs
|
---|
| 1350 | invalid, though, so we haven't changed the code back. */
|
---|
| 1351 | #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
|
---|
| 1352 |
|
---|
| 1353 | /* If this bit is set, then special characters are always special
|
---|
| 1354 | regardless of where they are in the pattern.
|
---|
| 1355 | If this bit is not set, then special characters are special only in
|
---|
| 1356 | some contexts; otherwise they are ordinary. Specifically,
|
---|
| 1357 | * + ? and intervals are only special when not after the beginning,
|
---|
| 1358 | open-group, or alternation operator. */
|
---|
| 1359 | #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
|
---|
| 1360 |
|
---|
| 1361 | /* If this bit is set, then *, +, ?, and { cannot be first in an re or
|
---|
| 1362 | immediately after an alternation or begin-group operator. */
|
---|
| 1363 | #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
|
---|
| 1364 |
|
---|
| 1365 | /* If this bit is set, then . matches newline.
|
---|
| 1366 | If not set, then it doesn't. */
|
---|
| 1367 | #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
|
---|
| 1368 |
|
---|
| 1369 | /* If this bit is set, then . doesn't match NUL.
|
---|
| 1370 | If not set, then it does. */
|
---|
| 1371 | #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
|
---|
| 1372 |
|
---|
| 1373 | /* If this bit is set, nonmatching lists [^...] do not match newline.
|
---|
| 1374 | If not set, they do. */
|
---|
| 1375 | #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
|
---|
| 1376 |
|
---|
| 1377 | /* If this bit is set, either \{...\} or {...} defines an
|
---|
| 1378 | interval, depending on RE_NO_BK_BRACES.
|
---|
| 1379 | If not set, \{, \}, {, and } are literals. */
|
---|
| 1380 | #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
|
---|
| 1381 |
|
---|
| 1382 | /* If this bit is set, +, ? and | aren't recognized as operators.
|
---|
| 1383 | If not set, they are. */
|
---|
| 1384 | #define RE_LIMITED_OPS (RE_INTERVALS << 1)
|
---|
| 1385 |
|
---|
| 1386 | /* If this bit is set, newline is an alternation operator.
|
---|
| 1387 | If not set, newline is literal. */
|
---|
| 1388 | #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
|
---|
| 1389 |
|
---|
| 1390 | /* If this bit is set, then `{...}' defines an interval, and \{ and \}
|
---|
| 1391 | are literals.
|
---|
| 1392 | If not set, then `\{...\}' defines an interval. */
|
---|
| 1393 | #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
|
---|
| 1394 |
|
---|
| 1395 | /* If this bit is set, (...) defines a group, and \( and \) are literals.
|
---|
| 1396 | If not set, \(...\) defines a group, and ( and ) are literals. */
|
---|
| 1397 | #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
|
---|
| 1398 |
|
---|
| 1399 | /* If this bit is set, then \<digit> matches <digit>.
|
---|
| 1400 | If not set, then \<digit> is a back-reference. */
|
---|
| 1401 | #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
|
---|
| 1402 |
|
---|
| 1403 | /* If this bit is set, then | is an alternation operator, and \| is literal.
|
---|
| 1404 | If not set, then \| is an alternation operator, and | is literal. */
|
---|
| 1405 | #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
|
---|
| 1406 |
|
---|
| 1407 | /* If this bit is set, then an ending range point collating higher
|
---|
| 1408 | than the starting range point, as in [z-a], is invalid.
|
---|
| 1409 | If not set, then when ending range point collates higher than the
|
---|
| 1410 | starting range point, the range is ignored. */
|
---|
| 1411 | #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
|
---|
| 1412 |
|
---|
| 1413 | /* If this bit is set, then an unmatched ) is ordinary.
|
---|
| 1414 | If not set, then an unmatched ) is invalid. */
|
---|
| 1415 | #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
|
---|
| 1416 |
|
---|
| 1417 | /* This global variable defines the particular regexp syntax to use (for
|
---|
| 1418 | some interfaces). When a regexp is compiled, the syntax used is
|
---|
| 1419 | stored in the pattern buffer, so changing this does not affect
|
---|
| 1420 | already-compiled regexps. */
|
---|
| 1421 | extern reg_syntax_t re_syntax_options;
|
---|
| 1422 | |
---|
| 1423 |
|
---|
| 1424 | /* Define combinations of the above bits for the standard possibilities.
|
---|
| 1425 | (The [[[ comments delimit what gets put into the Texinfo file, so
|
---|
| 1426 | don't delete them!) */
|
---|
| 1427 | /* [[[begin syntaxes]]] */
|
---|
| 1428 | #define RE_SYNTAX_EMACS 0
|
---|
| 1429 |
|
---|
| 1430 | #define RE_SYNTAX_AWK \
|
---|
| 1431 | (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \
|
---|
| 1432 | | RE_NO_BK_PARENS | RE_NO_BK_REFS \
|
---|
| 1433 | | RE_NO_BK_VAR | RE_NO_EMPTY_RANGES \
|
---|
| 1434 | | RE_UNMATCHED_RIGHT_PAREN_ORD)
|
---|
| 1435 |
|
---|
| 1436 | #define RE_SYNTAX_POSIX_AWK \
|
---|
| 1437 | (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
|
---|
| 1438 |
|
---|
| 1439 | #define RE_SYNTAX_GREP \
|
---|
| 1440 | (RE_BK_PLUS_QM | RE_CHAR_CLASSES \
|
---|
| 1441 | | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \
|
---|
| 1442 | | RE_NEWLINE_ALT)
|
---|
| 1443 |
|
---|
| 1444 | #define RE_SYNTAX_EGREP \
|
---|
| 1445 | (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \
|
---|
| 1446 | | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \
|
---|
| 1447 | | RE_NEWLINE_ALT | RE_NO_BK_PARENS \
|
---|
| 1448 | | RE_NO_BK_VBAR)
|
---|
| 1449 |
|
---|
| 1450 | #define RE_SYNTAX_POSIX_EGREP \
|
---|
| 1451 | (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
|
---|
| 1452 |
|
---|
| 1453 | #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
|
---|
| 1454 |
|
---|
| 1455 | /* Syntax bits common to both basic and extended POSIX regex syntax. */
|
---|
| 1456 | #define _RE_SYNTAX_POSIX_COMMON \
|
---|
| 1457 | (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \
|
---|
| 1458 | | RE_INTERVALS | RE_NO_EMPTY_RANGES)
|
---|
| 1459 |
|
---|
| 1460 | #define RE_SYNTAX_POSIX_BASIC \
|
---|
| 1461 | (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
|
---|
| 1462 |
|
---|
| 1463 | /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
|
---|
| 1464 | RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
|
---|
| 1465 | isn't minimal, since other operators, such as \`, aren't disabled. */
|
---|
| 1466 | #define RE_SYNTAX_POSIX_MINIMAL_BASIC \
|
---|
| 1467 | (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
|
---|
| 1468 |
|
---|
| 1469 | #define RE_SYNTAX_POSIX_EXTENDED \
|
---|
| 1470 | (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
|
---|
| 1471 | | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
|
---|
| 1472 | | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
|
---|
| 1473 | | RE_UNMATCHED_RIGHT_PAREN_ORD)
|
---|
| 1474 |
|
---|
| 1475 | /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
|
---|
| 1476 | replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
|
---|
| 1477 | #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \
|
---|
| 1478 | (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
|
---|
| 1479 | | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \
|
---|
| 1480 | | RE_NO_BK_PARENS | RE_NO_BK_REFS \
|
---|
| 1481 | | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD)
|
---|
| 1482 | /* [[[end syntaxes]]] */
|
---|
| 1483 |
|
---|
| 1484 | /* Maximum number of duplicates an interval can allow. Some systems
|
---|
| 1485 | (erroneously) define this in other header files, but we want our
|
---|
| 1486 | value, so remove any previous define. */
|
---|
| 1487 | #ifdef RE_DUP_MAX
|
---|
| 1488 | #undef RE_DUP_MAX
|
---|
| 1489 | #endif
|
---|
| 1490 | #define RE_DUP_MAX ((1 << 15) - 1)
|
---|
| 1491 |
|
---|
| 1492 |
|
---|
| 1493 |
|
---|
| 1494 | /* POSIX `cflags' bits (i.e., information for `regcomp'). */
|
---|
| 1495 |
|
---|
| 1496 | /* If this bit is set, then use extended regular expression syntax.
|
---|
| 1497 | If not set, then use basic regular expression syntax. */
|
---|
| 1498 | #define REG_EXTENDED 1
|
---|
| 1499 |
|
---|
| 1500 | /* If this bit is set, then ignore case when matching.
|
---|
| 1501 | If not set, then case is significant. */
|
---|
| 1502 | #define REG_ICASE (REG_EXTENDED << 1)
|
---|
| 1503 |
|
---|
| 1504 | /* If this bit is set, then anchors do not match at newline
|
---|
| 1505 | characters in the string.
|
---|
| 1506 | If not set, then anchors do match at newlines. */
|
---|
| 1507 | #define REG_NEWLINE (REG_ICASE << 1)
|
---|
| 1508 |
|
---|
| 1509 | /* If this bit is set, then report only success or fail in regexec.
|
---|
| 1510 | If not set, then returns differ between not matching and errors. */
|
---|
| 1511 | #define REG_NOSUB (REG_NEWLINE << 1)
|
---|
| 1512 |
|
---|
| 1513 |
|
---|
| 1514 | /* POSIX `eflags' bits (i.e., information for regexec). */
|
---|
| 1515 |
|
---|
| 1516 | /* If this bit is set, then the beginning-of-line operator doesn't match
|
---|
| 1517 | the beginning of the string (presumably because it's not the
|
---|
| 1518 | beginning of a line).
|
---|
| 1519 | If not set, then the beginning-of-line operator does match the
|
---|
| 1520 | beginning of the string. */
|
---|
| 1521 | #define REG_NOTBOL 1
|
---|
| 1522 |
|
---|
| 1523 | /* Like REG_NOTBOL, except for the end-of-line. */
|
---|
| 1524 | #define REG_NOTEOL (1 << 1)
|
---|
| 1525 |
|
---|
| 1526 | /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
|
---|
| 1527 | * `re_match_2' returns information about at least this many registers
|
---|
| 1528 | * the first time a `regs' structure is passed.
|
---|
| 1529 | *
|
---|
| 1530 | * Also, this is the greatest number of backreferenced subexpressions
|
---|
| 1531 | * allowed in a pattern being matched without caller-supplied registers.
|
---|
| 1532 | */
|
---|
| 1533 | #ifndef RE_NREGS
|
---|
| 1534 | #define RE_NREGS 30
|
---|
| 1535 | #endif
|
---|
| 1536 |
|
---|
| 1537 | extern int rx_cache_bound;
|
---|
| 1538 | extern char rx_version_string[];
|
---|
| 1539 |
|
---|
| 1540 |
|
---|
| 1541 | |
---|
| 1542 |
|
---|
| 1543 | #ifdef RX_WANT_RX_DEFS
|
---|
| 1544 |
|
---|
| 1545 | /* This is decls to the interesting subsystems and lower layers
|
---|
| 1546 | * of rx. Everything which doesn't have a public counterpart in
|
---|
| 1547 | * regex.c is declared here.
|
---|
| 1548 | */
|
---|
| 1549 |
|
---|
| 1550 |
|
---|
| 1551 | #ifdef __STDC__
|
---|
| 1552 | typedef void (*rx_hash_freefn) (struct rx_hash_item * it);
|
---|
| 1553 | #else /* ndef __STDC__ */
|
---|
| 1554 | typedef void (*rx_hash_freefn) ();
|
---|
| 1555 | #endif /* ndef __STDC__ */
|
---|
| 1556 |
|
---|
| 1557 |
|
---|
| 1558 | |
---|
| 1559 |
|
---|
| 1560 |
|
---|
[23508] | 1561 | #ifdef __STDC__
|
---|
[3745] | 1562 | RX_DECL int rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b);
|
---|
[23508] | 1563 | RX_DECL int rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b);
|
---|
[3745] | 1564 | RX_DECL int rx_bitset_empty (int size, rx_Bitset set);
|
---|
| 1565 | RX_DECL void rx_bitset_null (int size, rx_Bitset b);
|
---|
| 1566 | RX_DECL void rx_bitset_universe (int size, rx_Bitset b);
|
---|
[23508] | 1567 | RX_DECL void rx_bitset_complement (int size, rx_Bitset b);
|
---|
[3745] | 1568 | RX_DECL void rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b);
|
---|
| 1569 | RX_DECL void rx_bitset_union (int size, rx_Bitset a, rx_Bitset b);
|
---|
| 1570 | RX_DECL void rx_bitset_intersection (int size,
|
---|
| 1571 | rx_Bitset a, rx_Bitset b);
|
---|
| 1572 | RX_DECL void rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b);
|
---|
| 1573 | RX_DECL void rx_bitset_revdifference (int size,
|
---|
| 1574 | rx_Bitset a, rx_Bitset b);
|
---|
| 1575 | RX_DECL void rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b);
|
---|
| 1576 | RX_DECL mg_u_long rx_bitset_hash (int size, rx_Bitset b);
|
---|
| 1577 | RX_DECL struct rx_hash_item * rx_hash_find (struct rx_hash * table,
|
---|
| 1578 | mg_u_long hash,
|
---|
| 1579 | void * value,
|
---|
| 1580 | struct rx_hash_rules * rules);
|
---|
| 1581 | RX_DECL struct rx_hash_item * rx_hash_store (struct rx_hash * table,
|
---|
| 1582 | mg_u_long hash,
|
---|
| 1583 | void * value,
|
---|
| 1584 | struct rx_hash_rules * rules);
|
---|
| 1585 | RX_DECL void rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules);
|
---|
| 1586 | RX_DECL void rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
|
---|
| 1587 | struct rx_hash_rules * rules);
|
---|
| 1588 | RX_DECL rx_Bitset rx_cset (struct rx *rx);
|
---|
| 1589 | RX_DECL rx_Bitset rx_copy_cset (struct rx *rx, rx_Bitset a);
|
---|
| 1590 | RX_DECL void rx_free_cset (struct rx * rx, rx_Bitset c);
|
---|
| 1591 | RX_DECL struct rexp_node * rexp_node (struct rx *rx,
|
---|
| 1592 | enum rexp_node_type type);
|
---|
| 1593 | RX_DECL struct rexp_node * rx_mk_r_cset (struct rx * rx,
|
---|
| 1594 | rx_Bitset b);
|
---|
| 1595 | RX_DECL struct rexp_node * rx_mk_r_concat (struct rx * rx,
|
---|
| 1596 | struct rexp_node * a,
|
---|
| 1597 | struct rexp_node * b);
|
---|
| 1598 | RX_DECL struct rexp_node * rx_mk_r_alternate (struct rx * rx,
|
---|
| 1599 | struct rexp_node * a,
|
---|
| 1600 | struct rexp_node * b);
|
---|
| 1601 | RX_DECL struct rexp_node * rx_mk_r_opt (struct rx * rx,
|
---|
| 1602 | struct rexp_node * a);
|
---|
| 1603 | RX_DECL struct rexp_node * rx_mk_r_star (struct rx * rx,
|
---|
| 1604 | struct rexp_node * a);
|
---|
| 1605 | RX_DECL struct rexp_node * rx_mk_r_2phase_star (struct rx * rx,
|
---|
| 1606 | struct rexp_node * a,
|
---|
| 1607 | struct rexp_node * b);
|
---|
| 1608 | RX_DECL struct rexp_node * rx_mk_r_side_effect (struct rx * rx,
|
---|
| 1609 | rx_side_effect a);
|
---|
| 1610 | RX_DECL struct rexp_node * rx_mk_r_data (struct rx * rx,
|
---|
| 1611 | void * a);
|
---|
| 1612 | RX_DECL void rx_free_rexp (struct rx * rx, struct rexp_node * node);
|
---|
| 1613 | RX_DECL struct rexp_node * rx_copy_rexp (struct rx *rx,
|
---|
| 1614 | struct rexp_node *node);
|
---|
| 1615 | RX_DECL struct rx_nfa_state * rx_nfa_state (struct rx *rx);
|
---|
| 1616 | RX_DECL void rx_free_nfa_state (struct rx_nfa_state * n);
|
---|
| 1617 | RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (struct rx * rx,
|
---|
[23508] | 1618 | int id);
|
---|
[3745] | 1619 | RX_DECL struct rx_nfa_edge * rx_nfa_edge (struct rx *rx,
|
---|
| 1620 | enum rx_nfa_etype type,
|
---|
| 1621 | struct rx_nfa_state *start,
|
---|
| 1622 | struct rx_nfa_state *dest);
|
---|
| 1623 | RX_DECL void rx_free_nfa_edge (struct rx_nfa_edge * e);
|
---|
| 1624 | RX_DECL void rx_free_nfa (struct rx *rx);
|
---|
| 1625 | RX_DECL int rx_build_nfa (struct rx *rx,
|
---|
| 1626 | struct rexp_node *rexp,
|
---|
| 1627 | struct rx_nfa_state **start,
|
---|
| 1628 | struct rx_nfa_state **end);
|
---|
| 1629 | RX_DECL void rx_name_nfa_states (struct rx *rx);
|
---|
| 1630 | RX_DECL int rx_eclose_nfa (struct rx *rx);
|
---|
| 1631 | RX_DECL void rx_delete_epsilon_transitions (struct rx *rx);
|
---|
| 1632 | RX_DECL int rx_compactify_nfa (struct rx *rx,
|
---|
| 1633 | void **mem, mg_u_long *size);
|
---|
| 1634 | RX_DECL void rx_release_superset (struct rx *rx,
|
---|
| 1635 | struct rx_superset *set);
|
---|
| 1636 | RX_DECL struct rx_superset * rx_superset_cons (struct rx * rx,
|
---|
| 1637 | struct rx_nfa_state *car, struct rx_superset *cdr);
|
---|
| 1638 | RX_DECL struct rx_superset * rx_superstate_eclosure_union
|
---|
| 1639 | (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl);
|
---|
| 1640 | RX_DECL struct rx_superstate * rx_superstate (struct rx *rx,
|
---|
| 1641 | struct rx_superset *set);
|
---|
| 1642 | RX_DECL struct rx_inx * rx_handle_cache_miss
|
---|
| 1643 | (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data);
|
---|
| 1644 | RX_DECL reg_errcode_t rx_compile (__const__ char *pattern, int size,
|
---|
| 1645 | reg_syntax_t syntax,
|
---|
[23508] | 1646 | struct re_pattern_buffer * rxb);
|
---|
[3745] | 1647 | RX_DECL void rx_blow_up_fastmap (struct re_pattern_buffer * rxb);
|
---|
| 1648 | #else /* STDC */
|
---|
| 1649 | RX_DECL int rx_bitset_is_equal ();
|
---|
| 1650 | RX_DECL int rx_bitset_is_subset ();
|
---|
| 1651 | RX_DECL int rx_bitset_empty ();
|
---|
| 1652 | RX_DECL void rx_bitset_null ();
|
---|
| 1653 | RX_DECL void rx_bitset_universe ();
|
---|
| 1654 | RX_DECL void rx_bitset_complement ();
|
---|
| 1655 | RX_DECL void rx_bitset_assign ();
|
---|
| 1656 | RX_DECL void rx_bitset_union ();
|
---|
| 1657 | RX_DECL void rx_bitset_intersection ();
|
---|
| 1658 | RX_DECL void rx_bitset_difference ();
|
---|
| 1659 | RX_DECL void rx_bitset_revdifference ();
|
---|
| 1660 | RX_DECL void rx_bitset_xor ();
|
---|
| 1661 | RX_DECL mg_u_long rx_bitset_hash ();
|
---|
| 1662 | RX_DECL struct rx_hash_item * rx_hash_find ();
|
---|
| 1663 | RX_DECL struct rx_hash_item * rx_hash_store ();
|
---|
| 1664 | RX_DECL void rx_hash_free ();
|
---|
| 1665 | RX_DECL void rx_free_hash_table ();
|
---|
| 1666 | RX_DECL rx_Bitset rx_cset ();
|
---|
| 1667 | RX_DECL rx_Bitset rx_copy_cset ();
|
---|
| 1668 | RX_DECL void rx_free_cset ();
|
---|
| 1669 | RX_DECL struct rexp_node * rexp_node ();
|
---|
| 1670 | RX_DECL struct rexp_node * rx_mk_r_cset ();
|
---|
| 1671 | RX_DECL struct rexp_node * rx_mk_r_concat ();
|
---|
| 1672 | RX_DECL struct rexp_node * rx_mk_r_alternate ();
|
---|
| 1673 | RX_DECL struct rexp_node * rx_mk_r_opt ();
|
---|
| 1674 | RX_DECL struct rexp_node * rx_mk_r_star ();
|
---|
| 1675 | RX_DECL struct rexp_node * rx_mk_r_2phase_star ();
|
---|
| 1676 | RX_DECL struct rexp_node * rx_mk_r_side_effect ();
|
---|
| 1677 | RX_DECL struct rexp_node * rx_mk_r_data ();
|
---|
| 1678 | RX_DECL void rx_free_rexp ();
|
---|
| 1679 | RX_DECL struct rexp_node * rx_copy_rexp ();
|
---|
| 1680 | RX_DECL struct rx_nfa_state * rx_nfa_state ();
|
---|
| 1681 | RX_DECL void rx_free_nfa_state ();
|
---|
| 1682 | RX_DECL struct rx_nfa_state * rx_id_to_nfa_state ();
|
---|
| 1683 | RX_DECL struct rx_nfa_edge * rx_nfa_edge ();
|
---|
| 1684 | RX_DECL void rx_free_nfa_edge ();
|
---|
| 1685 | RX_DECL void rx_free_nfa ();
|
---|
| 1686 | RX_DECL int rx_build_nfa ();
|
---|
| 1687 | RX_DECL void rx_name_nfa_states ();
|
---|
| 1688 | RX_DECL int rx_eclose_nfa ();
|
---|
| 1689 | RX_DECL void rx_delete_epsilon_transitions ();
|
---|
| 1690 | RX_DECL int rx_compactify_nfa ();
|
---|
| 1691 | RX_DECL void rx_release_superset ();
|
---|
| 1692 | RX_DECL struct rx_superset * rx_superset_cons ();
|
---|
| 1693 | RX_DECL struct rx_superset * rx_superstate_eclosure_union ();
|
---|
| 1694 | RX_DECL struct rx_superstate * rx_superstate ();
|
---|
| 1695 | RX_DECL struct rx_inx * rx_handle_cache_miss ();
|
---|
| 1696 | RX_DECL reg_errcode_t rx_compile ();
|
---|
| 1697 | RX_DECL void rx_blow_up_fastmap ();
|
---|
| 1698 | #endif /* STDC */
|
---|
| 1699 |
|
---|
| 1700 |
|
---|
| 1701 | #endif /* RX_WANT_RX_DEFS */
|
---|
| 1702 |
|
---|
| 1703 |
|
---|
| 1704 | |
---|
| 1705 |
|
---|
| 1706 | #ifdef __STDC__
|
---|
| 1707 | extern int re_search_2 (struct re_pattern_buffer *rxb,
|
---|
| 1708 | __const__ char * string1, int size1,
|
---|
| 1709 | __const__ char * string2, int size2,
|
---|
| 1710 | int startpos, int range,
|
---|
| 1711 | struct re_registers *regs,
|
---|
| 1712 | int stop);
|
---|
| 1713 | extern int re_search (struct re_pattern_buffer * rxb, __const__ char *string,
|
---|
| 1714 | int size, int startpos, int range,
|
---|
| 1715 | struct re_registers *regs);
|
---|
| 1716 | extern int re_match_2 (struct re_pattern_buffer * rxb,
|
---|
| 1717 | __const__ char * string1, int size1,
|
---|
| 1718 | __const__ char * string2, int size2,
|
---|
| 1719 | int pos, struct re_registers *regs, int stop);
|
---|
| 1720 | extern int re_match (struct re_pattern_buffer * rxb,
|
---|
| 1721 | __const__ char * string,
|
---|
| 1722 | int size, int pos,
|
---|
| 1723 | struct re_registers *regs);
|
---|
| 1724 | extern reg_syntax_t re_set_syntax (reg_syntax_t syntax);
|
---|
| 1725 | extern void re_set_registers (struct re_pattern_buffer *bufp,
|
---|
| 1726 | struct re_registers *regs,
|
---|
| 1727 | unsigned num_regs,
|
---|
| 1728 | regoff_t * starts, regoff_t * ends);
|
---|
| 1729 | extern __const__ char * re_compile_pattern (__const__ char *pattern,
|
---|
| 1730 | int length,
|
---|
| 1731 | struct re_pattern_buffer * rxb);
|
---|
| 1732 | extern int re_compile_fastmap (struct re_pattern_buffer * rxb);
|
---|
| 1733 | extern char * re_comp (__const__ char *s);
|
---|
| 1734 | extern int re_exec (__const__ char *s);
|
---|
| 1735 | extern int regcomp (regex_t * preg, __const__ char * pattern, int cflags);
|
---|
| 1736 | extern int regexec (__const__ regex_t *preg, __const__ char *string,
|
---|
| 1737 | size_t nmatch, regmatch_t pmatch[],
|
---|
| 1738 | int eflags);
|
---|
| 1739 | extern size_t regerror (int errcode, __const__ regex_t *preg,
|
---|
| 1740 | char *errbuf, size_t errbuf_size);
|
---|
| 1741 | extern void regfree (regex_t *preg);
|
---|
| 1742 |
|
---|
| 1743 | #else /* STDC */
|
---|
| 1744 | extern int re_search_2 ();
|
---|
| 1745 | extern int re_search ();
|
---|
| 1746 | extern int re_match_2 ();
|
---|
| 1747 | extern int re_match ();
|
---|
| 1748 | extern reg_syntax_t re_set_syntax ();
|
---|
| 1749 | extern void re_set_registers ();
|
---|
| 1750 | extern __const__ char * re_compile_pattern ();
|
---|
| 1751 | extern int re_compile_fastmap ();
|
---|
| 1752 | extern char * re_comp ();
|
---|
| 1753 | extern int re_exec ();
|
---|
| 1754 | extern int regcomp ();
|
---|
| 1755 | extern int regexec ();
|
---|
| 1756 | extern size_t regerror ();
|
---|
| 1757 | extern void regfree ();
|
---|
| 1758 |
|
---|
| 1759 | #endif /* STDC */
|
---|
| 1760 |
|
---|
| 1761 | |
---|
| 1762 |
|
---|
| 1763 |
|
---|
| 1764 | #ifdef RX_WANT_RX_DEFS
|
---|
| 1765 |
|
---|
| 1766 | struct rx_counter_frame
|
---|
| 1767 | {
|
---|
| 1768 | int tag;
|
---|
| 1769 | int val;
|
---|
| 1770 | struct rx_counter_frame * inherited_from; /* If this is a copy. */
|
---|
| 1771 | struct rx_counter_frame * cdr;
|
---|
| 1772 | };
|
---|
| 1773 |
|
---|
| 1774 | struct rx_backtrack_frame
|
---|
| 1775 | {
|
---|
| 1776 | char * counter_stack_sp;
|
---|
| 1777 |
|
---|
| 1778 | /* A frame is used to save the matchers state when it crosses a
|
---|
| 1779 | * backtracking point. The `stk_' fields correspond to variables
|
---|
| 1780 | * in re_search_2 (just strip off thes `stk_'). They are documented
|
---|
| 1781 | * tere.
|
---|
| 1782 | */
|
---|
| 1783 | struct rx_superstate * stk_super;
|
---|
| 1784 | unsigned int stk_c;
|
---|
| 1785 | struct rx_string_position stk_test_pos;
|
---|
| 1786 | int stk_last_l;
|
---|
| 1787 | int stk_last_r;
|
---|
| 1788 | int stk_test_ret;
|
---|
| 1789 |
|
---|
| 1790 | /* This is the list of options left to explore at the backtrack
|
---|
| 1791 | * point for which this frame was created.
|
---|
| 1792 | */
|
---|
| 1793 | struct rx_distinct_future * df;
|
---|
| 1794 | struct rx_distinct_future * first_df;
|
---|
| 1795 |
|
---|
| 1796 | #ifdef RX_DEBUG
|
---|
| 1797 | int stk_line_no;
|
---|
| 1798 | #endif
|
---|
| 1799 | };
|
---|
| 1800 |
|
---|
| 1801 | struct rx_stack_chunk
|
---|
| 1802 | {
|
---|
| 1803 | struct rx_stack_chunk * next_chunk;
|
---|
| 1804 | int bytes_left;
|
---|
| 1805 | char * sp;
|
---|
| 1806 | };
|
---|
| 1807 |
|
---|
| 1808 | enum rx_outer_entry
|
---|
| 1809 | {
|
---|
| 1810 | rx_outer_start,
|
---|
| 1811 | rx_outer_fastmap,
|
---|
| 1812 | rx_outer_test,
|
---|
| 1813 | rx_outer_restore_pos
|
---|
| 1814 | };
|
---|
| 1815 |
|
---|
| 1816 | enum rx_fastmap_return
|
---|
| 1817 | {
|
---|
| 1818 | rx_fastmap_continuation,
|
---|
| 1819 | rx_fastmap_error,
|
---|
| 1820 | rx_fastmap_ok,
|
---|
| 1821 | rx_fastmap_fail
|
---|
| 1822 | };
|
---|
| 1823 |
|
---|
| 1824 | enum rx_fastmap_entry
|
---|
| 1825 | {
|
---|
| 1826 | rx_fastmap_start,
|
---|
| 1827 | rx_fastmap_string_break
|
---|
| 1828 | };
|
---|
| 1829 |
|
---|
| 1830 | enum rx_test_return
|
---|
| 1831 | {
|
---|
| 1832 | rx_test_continuation,
|
---|
| 1833 | rx_test_error,
|
---|
| 1834 | rx_test_fail,
|
---|
| 1835 | rx_test_ok
|
---|
| 1836 | };
|
---|
| 1837 |
|
---|
| 1838 | enum rx_test_internal_return
|
---|
| 1839 | {
|
---|
| 1840 | rx_test_internal_error,
|
---|
| 1841 | rx_test_found_first,
|
---|
| 1842 | rx_test_line_finished
|
---|
| 1843 | };
|
---|
| 1844 |
|
---|
| 1845 | enum rx_test_match_entry
|
---|
| 1846 | {
|
---|
| 1847 | rx_test_start,
|
---|
| 1848 | rx_test_cache_hit_loop,
|
---|
| 1849 | rx_test_backreference_check,
|
---|
| 1850 | rx_test_backtrack_return
|
---|
| 1851 | };
|
---|
| 1852 |
|
---|
| 1853 | struct rx_search_state
|
---|
| 1854 | {
|
---|
| 1855 | /* Two groups of registers are kept. The group with the register state
|
---|
| 1856 | * of the current test match, and the group that holds the state at the end
|
---|
| 1857 | * of the best known match, if any.
|
---|
| 1858 | *
|
---|
| 1859 | * For some patterns, there may also be registers saved on the stack.
|
---|
| 1860 | */
|
---|
| 1861 | unsigned num_regs; /* Includes an element for register zero. */
|
---|
| 1862 | regoff_t * lparen; /* scratch space for register returns */
|
---|
| 1863 | regoff_t * rparen;
|
---|
| 1864 | regoff_t * best_lpspace; /* in case the user doesn't want these */
|
---|
| 1865 | regoff_t * best_rpspace; /* values, we still need space to store
|
---|
| 1866 | * them. Normally, this memoryis unused
|
---|
| 1867 | * and the space pointed to by REGS is
|
---|
| 1868 | * used instead.
|
---|
| 1869 | */
|
---|
| 1870 |
|
---|
| 1871 | int last_l; /* Highest index of a valid lparen. */
|
---|
| 1872 | int last_r; /* It's dual. */
|
---|
| 1873 |
|
---|
| 1874 | int * best_lparen; /* This contains the best known register */
|
---|
| 1875 | int * best_rparen; /* assignments.
|
---|
| 1876 | * This may point to the same mem as
|
---|
| 1877 | * best_lpspace, or it might point to memory
|
---|
| 1878 | * passed by the caller.
|
---|
| 1879 | */
|
---|
| 1880 | int best_last_l; /* best_last_l:best_lparen::last_l:lparen */
|
---|
| 1881 | int best_last_r;
|
---|
| 1882 |
|
---|
| 1883 |
|
---|
| 1884 | unsigned char * translate;
|
---|
| 1885 |
|
---|
| 1886 | struct rx_string_position outer_pos;
|
---|
| 1887 |
|
---|
| 1888 | struct rx_superstate * start_super;
|
---|
| 1889 | int nfa_choice;
|
---|
| 1890 | int first_found; /* If true, return after finding any match. */
|
---|
| 1891 | int ret_val;
|
---|
| 1892 |
|
---|
| 1893 | /* For continuations... */
|
---|
| 1894 | enum rx_outer_entry outer_search_resume_pt;
|
---|
| 1895 | struct re_pattern_buffer * saved_rxb;
|
---|
| 1896 | int saved_startpos;
|
---|
| 1897 | int saved_range;
|
---|
| 1898 | int saved_stop;
|
---|
| 1899 | int saved_total_size;
|
---|
| 1900 | rx_get_burst_fn saved_get_burst;
|
---|
| 1901 | rx_back_check_fn saved_back_check;
|
---|
| 1902 | struct re_registers * saved_regs;
|
---|
| 1903 |
|
---|
| 1904 | /**
|
---|
| 1905 | ** state for fastmap
|
---|
| 1906 | **/
|
---|
| 1907 | char * fastmap;
|
---|
| 1908 | int fastmap_chr;
|
---|
| 1909 | int fastmap_val;
|
---|
| 1910 |
|
---|
| 1911 | /* for continuations in the fastmap procedure: */
|
---|
| 1912 | enum rx_fastmap_entry fastmap_resume_pt;
|
---|
| 1913 |
|
---|
| 1914 | /**
|
---|
| 1915 | ** state for test_match
|
---|
| 1916 | **/
|
---|
| 1917 |
|
---|
| 1918 | /* The current superNFA position of the matcher. */
|
---|
| 1919 | struct rx_superstate * super;
|
---|
| 1920 |
|
---|
| 1921 | /* The matcher interprets a series of instruction frames.
|
---|
| 1922 | * This is the `instruction counter' for the interpretation.
|
---|
| 1923 | */
|
---|
| 1924 | struct rx_inx * ifr;
|
---|
| 1925 |
|
---|
| 1926 | /* We insert a ghost character in the string to prime
|
---|
| 1927 | * the nfa. test_pos.pos, test_pos.str_half, and test_pos.end_half
|
---|
| 1928 | * keep track of the test-match position and string-half.
|
---|
| 1929 | */
|
---|
| 1930 | unsigned char c;
|
---|
| 1931 |
|
---|
| 1932 | /* Position within the string. */
|
---|
| 1933 | struct rx_string_position test_pos;
|
---|
| 1934 |
|
---|
| 1935 | struct rx_stack_chunk * counter_stack;
|
---|
| 1936 | struct rx_stack_chunk * backtrack_stack;
|
---|
| 1937 | int backtrack_frame_bytes;
|
---|
| 1938 | int chunk_bytes;
|
---|
| 1939 | struct rx_stack_chunk * free_chunks;
|
---|
| 1940 |
|
---|
| 1941 | /* To return from this function, set test_ret and
|
---|
| 1942 | * `goto test_do_return'.
|
---|
| 1943 | *
|
---|
| 1944 | * Possible return values are:
|
---|
| 1945 | * 1 --- end of string while the superNFA is still going
|
---|
| 1946 | * 0 --- internal error (out of memory)
|
---|
| 1947 | * -1 --- search completed by reaching the superNFA fail state
|
---|
| 1948 | * -2 --- a match was found, maybe not the longest.
|
---|
| 1949 | *
|
---|
| 1950 | * When the search is complete (-1), best_last_r indicates whether
|
---|
| 1951 | * a match was found.
|
---|
| 1952 | *
|
---|
| 1953 | * -2 is return only if search_state.first_found is non-zero.
|
---|
| 1954 | *
|
---|
| 1955 | * if search_state.first_found is non-zero, a return of -1 indicates no match,
|
---|
| 1956 | * otherwise, best_last_r has to be checked.
|
---|
| 1957 | */
|
---|
| 1958 | int test_ret;
|
---|
| 1959 |
|
---|
| 1960 | int could_have_continued;
|
---|
| 1961 |
|
---|
| 1962 | #ifdef RX_DEBUG
|
---|
| 1963 | int backtrack_depth;
|
---|
| 1964 | /* There is a search tree with every node as set of deterministic
|
---|
| 1965 | * transitions in the super nfa. For every branch of a
|
---|
| 1966 | * backtrack point is an edge in the tree.
|
---|
| 1967 | * This counts up a pre-order of nodes in that tree.
|
---|
| 1968 | * It's saved on the search stack and printed when debugging.
|
---|
| 1969 | */
|
---|
| 1970 | int line_no;
|
---|
| 1971 | int lines_found;
|
---|
| 1972 | #endif
|
---|
| 1973 |
|
---|
| 1974 |
|
---|
| 1975 | /* For continuations within the match tester */
|
---|
| 1976 | enum rx_test_match_entry test_match_resume_pt;
|
---|
| 1977 | struct rx_inx * saved_next_tr_table;
|
---|
| 1978 | struct rx_inx * saved_this_tr_table;
|
---|
| 1979 | int saved_reg;
|
---|
| 1980 | struct rx_backtrack_frame * saved_bf;
|
---|
| 1981 |
|
---|
| 1982 | };
|
---|
| 1983 |
|
---|
| 1984 | |
---|
| 1985 |
|
---|
| 1986 | extern char rx_slowmap[];
|
---|
| 1987 | extern unsigned char rx_id_translation[];
|
---|
| 1988 |
|
---|
| 1989 | static __inline__ void
|
---|
| 1990 | init_fastmap (rxb, search_state)
|
---|
| 1991 | struct re_pattern_buffer * rxb;
|
---|
| 1992 | struct rx_search_state * search_state;
|
---|
| 1993 | {
|
---|
| 1994 | search_state->fastmap = (rxb->fastmap
|
---|
| 1995 | ? (char *)rxb->fastmap
|
---|
| 1996 | : (char *)rx_slowmap);
|
---|
| 1997 | /* Update the fastmap now if not correct already.
|
---|
| 1998 | * When the regexp was compiled, the fastmap was computed
|
---|
| 1999 | * and stored in a bitset. This expands the bitset into a
|
---|
| 2000 | * character array containing 1s and 0s.
|
---|
| 2001 | */
|
---|
| 2002 | if ((search_state->fastmap == rxb->fastmap) && !rxb->fastmap_accurate)
|
---|
| 2003 | rx_blow_up_fastmap (rxb);
|
---|
| 2004 | search_state->fastmap_chr = -1;
|
---|
| 2005 | search_state->fastmap_val = 0;
|
---|
| 2006 | search_state->fastmap_resume_pt = rx_fastmap_start;
|
---|
| 2007 | }
|
---|
| 2008 |
|
---|
| 2009 | static __inline__ void
|
---|
| 2010 | uninit_fastmap (rxb, search_state)
|
---|
| 2011 | struct re_pattern_buffer * rxb;
|
---|
| 2012 | struct rx_search_state * search_state;
|
---|
| 2013 | {
|
---|
| 2014 | /* Unset the fastmap sentinel */
|
---|
| 2015 | if (search_state->fastmap_chr >= 0)
|
---|
| 2016 | search_state->fastmap[search_state->fastmap_chr]
|
---|
| 2017 | = search_state->fastmap_val;
|
---|
| 2018 | }
|
---|
| 2019 |
|
---|
| 2020 | static __inline__ int
|
---|
| 2021 | fastmap_search (rxb, stop, get_burst, app_closure, search_state)
|
---|
| 2022 | struct re_pattern_buffer * rxb;
|
---|
| 2023 | int stop;
|
---|
| 2024 | rx_get_burst_fn get_burst;
|
---|
| 2025 | void * app_closure;
|
---|
| 2026 | struct rx_search_state * search_state;
|
---|
| 2027 | {
|
---|
| 2028 | enum rx_fastmap_entry pc;
|
---|
| 2029 |
|
---|
| 2030 | if (0)
|
---|
| 2031 | {
|
---|
| 2032 | return_continuation:
|
---|
| 2033 | search_state->fastmap_resume_pt = pc;
|
---|
| 2034 | return rx_fastmap_continuation;
|
---|
| 2035 | }
|
---|
| 2036 |
|
---|
| 2037 | pc = search_state->fastmap_resume_pt;
|
---|
| 2038 |
|
---|
| 2039 | switch (pc)
|
---|
| 2040 | {
|
---|
| 2041 | default:
|
---|
| 2042 | return rx_fastmap_error;
|
---|
| 2043 | case rx_fastmap_start:
|
---|
| 2044 | init_fastmap_sentinal:
|
---|
| 2045 | /* For the sake of fast fastmapping, set a sentinal in the fastmap.
|
---|
| 2046 | * This sentinal will trap the fastmap loop when it reaches the last
|
---|
| 2047 | * valid character in a string half.
|
---|
| 2048 | *
|
---|
| 2049 | * This must be reset when the fastmap/search loop crosses a string
|
---|
| 2050 | * boundry, and before returning to the caller. So sometimes,
|
---|
| 2051 | * the fastmap loop is restarted with `continue', othertimes by
|
---|
| 2052 | * `goto init_fastmap_sentinal'.
|
---|
| 2053 | */
|
---|
| 2054 | if (search_state->outer_pos.size)
|
---|
| 2055 | {
|
---|
| 2056 | search_state->fastmap_chr = ((search_state->outer_pos.search_direction == 1)
|
---|
| 2057 | ? *(search_state->outer_pos.end - 1)
|
---|
| 2058 | : *search_state->outer_pos.string);
|
---|
| 2059 | search_state->fastmap_val
|
---|
| 2060 | = search_state->fastmap[search_state->fastmap_chr];
|
---|
| 2061 | search_state->fastmap[search_state->fastmap_chr] = 1;
|
---|
| 2062 | }
|
---|
| 2063 | else
|
---|
| 2064 | {
|
---|
| 2065 | search_state->fastmap_chr = -1;
|
---|
| 2066 | search_state->fastmap_val = 0;
|
---|
| 2067 | }
|
---|
| 2068 |
|
---|
| 2069 | if (search_state->outer_pos.pos >= search_state->outer_pos.end)
|
---|
| 2070 | goto fastmap_hit_bound;
|
---|
| 2071 | else
|
---|
| 2072 | {
|
---|
| 2073 | if (search_state->outer_pos.search_direction == 1)
|
---|
| 2074 | {
|
---|
| 2075 | if (search_state->fastmap_val)
|
---|
| 2076 | {
|
---|
| 2077 | for (;;)
|
---|
| 2078 | {
|
---|
| 2079 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
| 2080 | ++search_state->outer_pos.pos;
|
---|
| 2081 | return rx_fastmap_ok;
|
---|
| 2082 | }
|
---|
| 2083 | }
|
---|
| 2084 | else
|
---|
| 2085 | {
|
---|
| 2086 | for (;;)
|
---|
| 2087 | {
|
---|
| 2088 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
| 2089 | ++search_state->outer_pos.pos;
|
---|
| 2090 | if (*search_state->outer_pos.pos != search_state->fastmap_chr)
|
---|
| 2091 | return rx_fastmap_ok;
|
---|
| 2092 | else
|
---|
| 2093 | {
|
---|
| 2094 | ++search_state->outer_pos.pos;
|
---|
| 2095 | if (search_state->outer_pos.pos == search_state->outer_pos.end)
|
---|
| 2096 | goto fastmap_hit_bound;
|
---|
| 2097 | }
|
---|
| 2098 | }
|
---|
| 2099 | }
|
---|
| 2100 | }
|
---|
| 2101 | else
|
---|
| 2102 | {
|
---|
| 2103 | __const__ unsigned char * bound;
|
---|
| 2104 | bound = search_state->outer_pos.string - 1;
|
---|
| 2105 | if (search_state->fastmap_val)
|
---|
| 2106 | {
|
---|
| 2107 | for (;;)
|
---|
| 2108 | {
|
---|
| 2109 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
| 2110 | --search_state->outer_pos.pos;
|
---|
| 2111 | return rx_fastmap_ok;
|
---|
| 2112 | }
|
---|
| 2113 | }
|
---|
| 2114 | else
|
---|
| 2115 | {
|
---|
| 2116 | for (;;)
|
---|
| 2117 | {
|
---|
| 2118 | while (!search_state->fastmap[*search_state->outer_pos.pos])
|
---|
| 2119 | --search_state->outer_pos.pos;
|
---|
| 2120 | if ((*search_state->outer_pos.pos != search_state->fastmap_chr) || search_state->fastmap_val)
|
---|
| 2121 | return rx_fastmap_ok;
|
---|
| 2122 | else
|
---|
| 2123 | {
|
---|
| 2124 | --search_state->outer_pos.pos;
|
---|
| 2125 | if (search_state->outer_pos.pos == bound)
|
---|
| 2126 | goto fastmap_hit_bound;
|
---|
| 2127 | }
|
---|
| 2128 | }
|
---|
| 2129 | }
|
---|
| 2130 | }
|
---|
| 2131 | }
|
---|
| 2132 |
|
---|
| 2133 | case rx_fastmap_string_break:
|
---|
| 2134 | fastmap_hit_bound:
|
---|
| 2135 | {
|
---|
| 2136 | /* If we hit a bound, it may be time to fetch another burst
|
---|
| 2137 | * of string, or it may be time to return a continuation to
|
---|
| 2138 | * the caller, or it might be time to fail.
|
---|
| 2139 | */
|
---|
| 2140 |
|
---|
| 2141 | int burst_state;
|
---|
| 2142 | burst_state = get_burst (&search_state->outer_pos, app_closure, stop);
|
---|
| 2143 | switch (burst_state)
|
---|
| 2144 | {
|
---|
| 2145 | default:
|
---|
| 2146 | case rx_get_burst_error:
|
---|
| 2147 | return rx_fastmap_error;
|
---|
| 2148 | case rx_get_burst_continuation:
|
---|
| 2149 | {
|
---|
| 2150 | pc = rx_fastmap_string_break;
|
---|
| 2151 | goto return_continuation;
|
---|
| 2152 | }
|
---|
| 2153 | case rx_get_burst_ok:
|
---|
| 2154 | goto init_fastmap_sentinal;
|
---|
| 2155 | case rx_get_burst_no_more:
|
---|
| 2156 | /* ...not a string split, simply no more string.
|
---|
| 2157 | *
|
---|
| 2158 | * When searching backward, running out of string
|
---|
| 2159 | * is reason to quit.
|
---|
| 2160 | *
|
---|
| 2161 | * When searching forward, we allow the possibility
|
---|
| 2162 | * of an (empty) match after the last character in the
|
---|
| 2163 | * virtual string. So, fall through to the matcher
|
---|
| 2164 | */
|
---|
| 2165 | return ( (search_state->outer_pos.search_direction == 1)
|
---|
| 2166 | ? rx_fastmap_ok
|
---|
| 2167 | : rx_fastmap_fail);
|
---|
| 2168 | }
|
---|
| 2169 | }
|
---|
| 2170 | }
|
---|
| 2171 |
|
---|
| 2172 | }
|
---|
| 2173 |
|
---|
| 2174 | |
---|
| 2175 |
|
---|
| 2176 |
|
---|
| 2177 | #ifdef emacs
|
---|
| 2178 | /* The `emacs' switch turns on certain matching commands
|
---|
| 2179 | * that make sense only in Emacs.
|
---|
| 2180 | */
|
---|
| 2181 | #include "sysfuncs.h"
|
---|
| 2182 | #include "lisp.h"
|
---|
| 2183 | #include "buffer.h"
|
---|
| 2184 | #include "syntax.h"
|
---|
| 2185 | #endif /* emacs */
|
---|
| 2186 |
|
---|
| 2187 | /* Setting RX_MEMDBUG is useful if you have dbmalloc. Maybe with similar
|
---|
| 2188 | * packages too.
|
---|
| 2189 | */
|
---|
| 2190 | #ifdef RX_MEMDBUG
|
---|
| 2191 | #include <malloc.h>
|
---|
| 2192 | #endif /* RX_RX_MEMDBUG */
|
---|
| 2193 |
|
---|
| 2194 | /* We used to test for `BSTRING' here, but only GCC and Emacs define
|
---|
| 2195 | * `BSTRING', as far as I know, and neither of them use this code.
|
---|
| 2196 | */
|
---|
| 2197 | #if HAVE_STRING_H || STDC_HEADERS
|
---|
| 2198 | #include <string.h>
|
---|
| 2199 |
|
---|
| 2200 | #ifndef bcmp
|
---|
| 2201 | #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
|
---|
| 2202 | #endif
|
---|
| 2203 |
|
---|
| 2204 | #ifndef bcopy
|
---|
| 2205 | #define bcopy(s, d, n) memcpy ((d), (s), (n))
|
---|
| 2206 | #endif
|
---|
| 2207 |
|
---|
| 2208 | #ifndef bzero
|
---|
| 2209 | #define bzero(s, n) memset ((s), 0, (n))
|
---|
| 2210 | #endif
|
---|
| 2211 |
|
---|
| 2212 | #else /* HAVE_STRING_H || STDC_HEADERS */
|
---|
| 2213 | #include <strings.h>
|
---|
| 2214 | #endif /* not (HAVE_STRING_H || STDC_HEADERS) */
|
---|
| 2215 |
|
---|
| 2216 | #ifdef STDC_HEADERS
|
---|
| 2217 | #include <stdlib.h>
|
---|
| 2218 | #else /* not STDC_HEADERS */
|
---|
| 2219 | char *malloc ();
|
---|
| 2220 | char *realloc ();
|
---|
| 2221 | #endif /* not STDC_HEADERS */
|
---|
| 2222 |
|
---|
| 2223 | |
---|
| 2224 |
|
---|
| 2225 |
|
---|
| 2226 | /* How many characters in the character set. */
|
---|
| 2227 | #define CHAR_SET_SIZE (1 << CHARBITS)
|
---|
| 2228 |
|
---|
| 2229 | #ifndef emacs
|
---|
| 2230 | /* Define the syntax basics for \<, \>, etc.
|
---|
| 2231 | * This must be nonzero for the wordchar and notwordchar pattern
|
---|
| 2232 | * commands in re_match_2.
|
---|
| 2233 | */
|
---|
| 2234 | #ifndef Sword
|
---|
| 2235 | #define Sword 1
|
---|
| 2236 | #endif
|
---|
| 2237 | #define SYNTAX(c) re_syntax_table[c]
|
---|
| 2238 | #ifdef SYNTAX_TABLE
|
---|
| 2239 | extern char *re_syntax_table;
|
---|
| 2240 | #else
|
---|
| 2241 | RX_DECL char re_syntax_table[CHAR_SET_SIZE];
|
---|
| 2242 | #endif
|
---|
| 2243 | #endif /* not emacs */
|
---|
| 2244 |
|
---|
| 2245 |
|
---|
| 2246 | /* Test if at very beginning or at very end of the virtual concatenation
|
---|
| 2247 | * of `string1' and `string2'. If only one string, it's `string2'.
|
---|
| 2248 | */
|
---|
| 2249 |
|
---|
| 2250 | #define AT_STRINGS_BEG() \
|
---|
| 2251 | ( -1 \
|
---|
| 2252 | == ((search_state.test_pos.pos - search_state.test_pos.string) \
|
---|
| 2253 | + search_state.test_pos.offset))
|
---|
| 2254 |
|
---|
| 2255 | #define AT_STRINGS_END() \
|
---|
| 2256 | ( (total_size - 1) \
|
---|
| 2257 | == ((search_state.test_pos.pos - search_state.test_pos.string) \
|
---|
| 2258 | + search_state.test_pos.offset))
|
---|
| 2259 |
|
---|
| 2260 |
|
---|
| 2261 | /* Test if POS + 1 points to a character which is word-constituent. We have
|
---|
| 2262 | * two special cases to check for: if past the end of string1, look at
|
---|
| 2263 | * the first character in string2; and if before the beginning of
|
---|
| 2264 | * string2, look at the last character in string1.
|
---|
| 2265 | *
|
---|
| 2266 | * Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG ().
|
---|
| 2267 | */
|
---|
| 2268 | #define LETTER_P(POS,OFF) \
|
---|
| 2269 | ( SYNTAX (fetch_char(POS, OFF, app_closure, stop)) \
|
---|
| 2270 | == Sword)
|
---|
| 2271 |
|
---|
| 2272 | /* Test if the character at D and the one after D differ with respect
|
---|
| 2273 | * to being word-constituent.
|
---|
| 2274 | */
|
---|
| 2275 | #define AT_WORD_BOUNDARY(d) \
|
---|
| 2276 | (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d,0) != LETTER_P (d, 1))
|
---|
| 2277 |
|
---|
| 2278 |
|
---|
| 2279 | #ifdef RX_SUPPORT_CONTINUATIONS
|
---|
| 2280 | #define RX_STACK_ALLOC(BYTES) malloc(BYTES)
|
---|
| 2281 | #define RX_STACK_FREE(MEM) free(MEM)
|
---|
| 2282 | #else
|
---|
| 2283 | #define RX_STACK_ALLOC(BYTES) alloca(BYTES)
|
---|
| 2284 | #define RX_STACK_FREE(MEM) \
|
---|
| 2285 | ((struct rx_stack_chunk *)MEM)->next_chunk = search_state.free_chunks; \
|
---|
| 2286 | search_state.free_chunks = ((struct rx_stack_chunk *)MEM);
|
---|
| 2287 |
|
---|
| 2288 | #endif
|
---|
| 2289 |
|
---|
| 2290 | #define PUSH(CHUNK_VAR,BYTES) \
|
---|
| 2291 | if (!CHUNK_VAR || (CHUNK_VAR->bytes_left < (BYTES))) \
|
---|
| 2292 | { \
|
---|
| 2293 | struct rx_stack_chunk * new_chunk; \
|
---|
| 2294 | if (search_state.free_chunks) \
|
---|
| 2295 | { \
|
---|
| 2296 | new_chunk = search_state.free_chunks; \
|
---|
| 2297 | search_state.free_chunks = search_state.free_chunks->next_chunk; \
|
---|
| 2298 | } \
|
---|
| 2299 | else \
|
---|
| 2300 | { \
|
---|
| 2301 | new_chunk = (struct rx_stack_chunk *)RX_STACK_ALLOC(search_state.chunk_bytes); \
|
---|
| 2302 | if (!new_chunk) \
|
---|
| 2303 | { \
|
---|
| 2304 | search_state.ret_val = 0; \
|
---|
| 2305 | goto test_do_return; \
|
---|
| 2306 | } \
|
---|
| 2307 | } \
|
---|
| 2308 | new_chunk->sp = (char *)new_chunk + sizeof (struct rx_stack_chunk); \
|
---|
| 2309 | new_chunk->bytes_left = (search_state.chunk_bytes \
|
---|
| 2310 | - (BYTES) \
|
---|
| 2311 | - sizeof (struct rx_stack_chunk)); \
|
---|
| 2312 | new_chunk->next_chunk = CHUNK_VAR; \
|
---|
| 2313 | CHUNK_VAR = new_chunk; \
|
---|
| 2314 | } \
|
---|
| 2315 | else \
|
---|
| 2316 | (CHUNK_VAR->sp += (BYTES)), (CHUNK_VAR->bytes_left -= (BYTES))
|
---|
| 2317 |
|
---|
[23508] | 2318 | #define POP(CHUNK_VAR,BYTES) \
|
---|
[3745] | 2319 | if (CHUNK_VAR->sp == ((char *)CHUNK_VAR + sizeof(*CHUNK_VAR))) \
|
---|
| 2320 | { \
|
---|
| 2321 | struct rx_stack_chunk * new_chunk = CHUNK_VAR->next_chunk; \
|
---|
| 2322 | RX_STACK_FREE(CHUNK_VAR); \
|
---|
| 2323 | CHUNK_VAR = new_chunk; \
|
---|
| 2324 | } \
|
---|
| 2325 | else \
|
---|
| 2326 | (CHUNK_VAR->sp -= BYTES), (CHUNK_VAR->bytes_left += BYTES)
|
---|
| 2327 |
|
---|
| 2328 |
|
---|
| 2329 |
|
---|
| 2330 | #define SRCH_TRANSLATE(C) search_state.translate[(unsigned char) (C)]
|
---|
| 2331 |
|
---|
| 2332 |
|
---|
| 2333 | |
---|
| 2334 |
|
---|
[23508] | 2335 |
|
---|
[3745] | 2336 | #ifdef __STDC__
|
---|
| 2337 | RX_DECL __inline__ int
|
---|
| 2338 | rx_search (struct re_pattern_buffer * rxb,
|
---|
| 2339 | intptr_t startpos,
|
---|
| 2340 | int range,
|
---|
| 2341 | int stop,
|
---|
| 2342 | int total_size,
|
---|
| 2343 | rx_get_burst_fn get_burst,
|
---|
| 2344 | rx_back_check_fn back_check,
|
---|
| 2345 | rx_fetch_char_fn fetch_char,
|
---|
| 2346 | void * app_closure,
|
---|
| 2347 | struct re_registers * regs,
|
---|
| 2348 | struct rx_search_state * resume_state,
|
---|
| 2349 | struct rx_search_state * save_state)
|
---|
| 2350 | #else
|
---|
| 2351 | RX_DECL __inline__ int
|
---|
| 2352 | rx_search (rxb, startpos, range, stop, total_size,
|
---|
| 2353 | get_burst, back_check, fetch_char,
|
---|
| 2354 | app_closure, regs, resume_state, save_state)
|
---|
| 2355 | struct re_pattern_buffer * rxb;
|
---|
| 2356 | intptr_t startpos;
|
---|
| 2357 | int range;
|
---|
| 2358 | int stop;
|
---|
| 2359 | int total_size;
|
---|
| 2360 | rx_get_burst_fn get_burst;
|
---|
| 2361 | rx_back_check_fn back_check;
|
---|
| 2362 | rx_fetch_char_fn fetch_char;
|
---|
| 2363 | void * app_closure;
|
---|
| 2364 | struct re_registers * regs;
|
---|
| 2365 | struct rx_search_state * resume_state;
|
---|
| 2366 | struct rx_search_state * save_state;
|
---|
| 2367 | #endif
|
---|
| 2368 | {
|
---|
| 2369 | int pc;
|
---|
| 2370 | int test_state;
|
---|
| 2371 | struct rx_search_state search_state;
|
---|
| 2372 |
|
---|
| 2373 | search_state.free_chunks = 0;
|
---|
| 2374 | if (!resume_state)
|
---|
| 2375 | pc = rx_outer_start;
|
---|
| 2376 | else
|
---|
| 2377 | {
|
---|
| 2378 | search_state = *resume_state;
|
---|
| 2379 | regs = search_state.saved_regs;
|
---|
| 2380 | rxb = search_state.saved_rxb;
|
---|
| 2381 | startpos = search_state.saved_startpos;
|
---|
| 2382 | range = search_state.saved_range;
|
---|
| 2383 | stop = search_state.saved_stop;
|
---|
| 2384 | total_size = search_state.saved_total_size;
|
---|
| 2385 | get_burst = search_state.saved_get_burst;
|
---|
| 2386 | back_check = search_state.saved_back_check;
|
---|
| 2387 | pc = search_state.outer_search_resume_pt;
|
---|
| 2388 | if (0)
|
---|
| 2389 | {
|
---|
| 2390 | return_continuation:
|
---|
| 2391 | if (save_state)
|
---|
| 2392 | {
|
---|
| 2393 | *save_state = search_state;
|
---|
| 2394 | save_state->saved_regs = regs;
|
---|
| 2395 | save_state->saved_rxb = rxb;
|
---|
| 2396 | save_state->saved_startpos = startpos;
|
---|
| 2397 | save_state->saved_range = range;
|
---|
| 2398 | save_state->saved_stop = stop;
|
---|
| 2399 | save_state->saved_total_size = total_size;
|
---|
| 2400 | save_state->saved_get_burst = get_burst;
|
---|
| 2401 | save_state->saved_back_check = back_check;
|
---|
| 2402 | save_state->outer_search_resume_pt = pc;
|
---|
| 2403 | }
|
---|
| 2404 | return rx_search_continuation;
|
---|
| 2405 | }
|
---|
| 2406 | }
|
---|
| 2407 |
|
---|
| 2408 | switch (pc)
|
---|
| 2409 | {
|
---|
| 2410 | case rx_outer_start:
|
---|
| 2411 | search_state.ret_val = rx_search_fail;
|
---|
| 2412 | ( search_state.lparen
|
---|
| 2413 | = search_state.rparen
|
---|
| 2414 | = search_state.best_lpspace
|
---|
| 2415 | = search_state.best_rpspace
|
---|
| 2416 | = 0);
|
---|
| 2417 |
|
---|
| 2418 | /* figure the number of registers we may need for use in backreferences.
|
---|
| 2419 | * the number here includes an element for register zero.
|
---|
| 2420 | */
|
---|
| 2421 | search_state.num_regs = rxb->re_nsub + 1;
|
---|
| 2422 |
|
---|
| 2423 |
|
---|
| 2424 | /* check for out-of-range startpos. */
|
---|
| 2425 | if ((startpos < 0) || (startpos > total_size))
|
---|
| 2426 | return rx_search_fail;
|
---|
| 2427 |
|
---|
| 2428 | /* fix up range if it might eventually take us outside the string. */
|
---|
| 2429 | {
|
---|
| 2430 | int endpos;
|
---|
| 2431 | endpos = startpos + range;
|
---|
| 2432 | if (endpos < -1)
|
---|
| 2433 | range = (-1 - startpos);
|
---|
| 2434 | else if (endpos > (total_size + 1))
|
---|
| 2435 | range = total_size - startpos;
|
---|
| 2436 | }
|
---|
| 2437 |
|
---|
| 2438 | /* if the search isn't to be a backwards one, don't waste time in a
|
---|
| 2439 | * long search for a pattern that says it is anchored.
|
---|
| 2440 | */
|
---|
| 2441 | if (rxb->begbuf_only && (range > 0))
|
---|
| 2442 | {
|
---|
| 2443 | if (startpos > 0)
|
---|
| 2444 | return rx_search_fail;
|
---|
| 2445 | else
|
---|
| 2446 | range = 1;
|
---|
| 2447 | }
|
---|
| 2448 |
|
---|
| 2449 | /* decide whether to use internal or user-provided reg buffers. */
|
---|
| 2450 | if (!regs || rxb->no_sub)
|
---|
| 2451 | {
|
---|
| 2452 | search_state.best_lpspace =
|
---|
| 2453 | (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
| 2454 | search_state.best_rpspace =
|
---|
| 2455 | (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
| 2456 | search_state.best_lparen = search_state.best_lpspace;
|
---|
| 2457 | search_state.best_rparen = search_state.best_rpspace;
|
---|
| 2458 | }
|
---|
| 2459 | else
|
---|
| 2460 | {
|
---|
| 2461 | /* have the register data arrays been allocated? */
|
---|
| 2462 | if (rxb->regs_allocated == REGS_UNALLOCATED)
|
---|
| 2463 | { /* no. so allocate them with malloc. we need one
|
---|
| 2464 | extra element beyond `search_state.num_regs' for the `-1' marker
|
---|
| 2465 | gnu code uses. */
|
---|
| 2466 | regs->num_regs = MAX (RE_NREGS, rxb->re_nsub + 1);
|
---|
| 2467 | regs->start = ((regoff_t *)
|
---|
| 2468 | malloc (regs->num_regs * sizeof ( regoff_t)));
|
---|
| 2469 | regs->end = ((regoff_t *)
|
---|
| 2470 | malloc (regs->num_regs * sizeof ( regoff_t)));
|
---|
| 2471 | if (regs->start == 0 || regs->end == 0)
|
---|
| 2472 | return rx_search_error;
|
---|
| 2473 | rxb->regs_allocated = REGS_REALLOCATE;
|
---|
| 2474 | }
|
---|
| 2475 | else if (rxb->regs_allocated == REGS_REALLOCATE)
|
---|
| 2476 | { /* yes. if we need more elements than were already
|
---|
| 2477 | allocated, reallocate them. if we need fewer, just
|
---|
| 2478 | leave it alone. */
|
---|
| 2479 | if (regs->num_regs < search_state.num_regs + 1)
|
---|
| 2480 | {
|
---|
| 2481 | regs->num_regs = search_state.num_regs + 1;
|
---|
| 2482 | regs->start = ((regoff_t *)
|
---|
| 2483 | realloc (regs->start,
|
---|
| 2484 | regs->num_regs * sizeof (regoff_t)));
|
---|
| 2485 | regs->end = ((regoff_t *)
|
---|
| 2486 | realloc (regs->end,
|
---|
| 2487 | regs->num_regs * sizeof ( regoff_t)));
|
---|
| 2488 | if (regs->start == 0 || regs->end == 0)
|
---|
| 2489 | return rx_search_error;
|
---|
| 2490 | }
|
---|
| 2491 | }
|
---|
| 2492 | else if (rxb->regs_allocated != REGS_FIXED)
|
---|
| 2493 | return rx_search_error;
|
---|
| 2494 |
|
---|
| 2495 | if (regs->num_regs < search_state.num_regs + 1)
|
---|
| 2496 | {
|
---|
| 2497 | search_state.best_lpspace =
|
---|
| 2498 | ((regoff_t *)
|
---|
| 2499 | REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
|
---|
| 2500 | search_state.best_rpspace =
|
---|
| 2501 | ((regoff_t *)
|
---|
| 2502 | REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
|
---|
| 2503 | search_state.best_lparen = search_state.best_lpspace;
|
---|
| 2504 | search_state.best_rparen = search_state.best_rpspace;
|
---|
| 2505 | }
|
---|
| 2506 | else
|
---|
| 2507 | {
|
---|
| 2508 | search_state.best_lparen = regs->start;
|
---|
| 2509 | search_state.best_rparen = regs->end;
|
---|
| 2510 | }
|
---|
| 2511 | }
|
---|
| 2512 |
|
---|
| 2513 | search_state.lparen =
|
---|
| 2514 | (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
| 2515 | search_state.rparen =
|
---|
| 2516 | (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
|
---|
| 2517 |
|
---|
| 2518 | if (! ( search_state.best_rparen
|
---|
| 2519 | && search_state.best_lparen
|
---|
| 2520 | && search_state.lparen && search_state.rparen))
|
---|
| 2521 | return rx_search_error;
|
---|
| 2522 |
|
---|
| 2523 | search_state.best_last_l = search_state.best_last_r = -1;
|
---|
| 2524 |
|
---|
| 2525 | search_state.translate = (rxb->translate
|
---|
| 2526 | ? rxb->translate
|
---|
| 2527 | : rx_id_translation);
|
---|
| 2528 |
|
---|
| 2529 |
|
---|
| 2530 |
|
---|
| 2531 | /*
|
---|
| 2532 | * two nfa's were compiled.
|
---|
| 2533 | * `0' is complete.
|
---|
| 2534 | * `1' faster but gets registers wrong and ends too soon.
|
---|
| 2535 | */
|
---|
| 2536 | search_state.nfa_choice = (regs && !rxb->least_subs) ? '\0' : '\1';
|
---|
| 2537 |
|
---|
| 2538 | /* we have the option to look for the best match or the first
|
---|
| 2539 | * one we can find. if the user isn't asking for register information,
|
---|
| 2540 | * we don't need to find the best match.
|
---|
| 2541 | */
|
---|
| 2542 | search_state.first_found = !regs;
|
---|
| 2543 |
|
---|
| 2544 | if (range >= 0)
|
---|
| 2545 | {
|
---|
| 2546 | search_state.outer_pos.search_end = startpos + range;
|
---|
| 2547 | search_state.outer_pos.search_direction = 1;
|
---|
| 2548 | }
|
---|
| 2549 | else
|
---|
| 2550 | {
|
---|
| 2551 | search_state.outer_pos.search_end = startpos + range;
|
---|
| 2552 | search_state.outer_pos.search_direction = -1;
|
---|
| 2553 | }
|
---|
| 2554 |
|
---|
| 2555 | /* the vacuous search always turns up nothing. */
|
---|
| 2556 | if ((search_state.outer_pos.search_direction == 1)
|
---|
| 2557 | ? (startpos > search_state.outer_pos.search_end)
|
---|
| 2558 | : (startpos < search_state.outer_pos.search_end))
|
---|
| 2559 | return rx_search_fail;
|
---|
| 2560 |
|
---|
| 2561 | /* now we build the starting state of the supernfa. */
|
---|
| 2562 | {
|
---|
| 2563 | struct rx_superset * start_contents;
|
---|
| 2564 | struct rx_nfa_state_set * start_nfa_set;
|
---|
| 2565 |
|
---|
| 2566 | /* we presume here that the nfa start state has only one
|
---|
| 2567 | * possible future with no side effects.
|
---|
| 2568 | */
|
---|
| 2569 | start_nfa_set = rxb->start->futures->destset;
|
---|
| 2570 | if ( rxb->rx.start_set
|
---|
| 2571 | && (rxb->rx.start_set->starts_for == &rxb->rx))
|
---|
| 2572 | start_contents = rxb->rx.start_set;
|
---|
| 2573 | else
|
---|
| 2574 | {
|
---|
| 2575 | start_contents =
|
---|
| 2576 | rx_superstate_eclosure_union (&rxb->rx,
|
---|
| 2577 | rx_superset_cons (&rxb->rx, 0, 0),
|
---|
| 2578 | start_nfa_set);
|
---|
| 2579 |
|
---|
| 2580 | if (!start_contents)
|
---|
| 2581 | return rx_search_fail;
|
---|
| 2582 |
|
---|
| 2583 | start_contents->starts_for = &rxb->rx;
|
---|
| 2584 | rxb->rx.start_set = start_contents;
|
---|
| 2585 | }
|
---|
| 2586 | if ( start_contents->superstate
|
---|
| 2587 | && (start_contents->superstate->rx_id == rxb->rx.rx_id))
|
---|
| 2588 | {
|
---|
| 2589 | search_state.start_super = start_contents->superstate;
|
---|
| 2590 | rx_lock_superstate (&rxb->rx, search_state.start_super);
|
---|
| 2591 | }
|
---|
| 2592 | else
|
---|
| 2593 | {
|
---|
| 2594 | rx_protect_superset (&rxb->rx, start_contents);
|
---|
| 2595 |
|
---|
| 2596 | search_state.start_super = rx_superstate (&rxb->rx, start_contents);
|
---|
| 2597 | if (!search_state.start_super)
|
---|
| 2598 | return rx_search_fail;
|
---|
| 2599 | rx_lock_superstate (&rxb->rx, search_state.start_super);
|
---|
| 2600 | rx_release_superset (&rxb->rx, start_contents);
|
---|
| 2601 | }
|
---|
| 2602 | }
|
---|
| 2603 |
|
---|
| 2604 |
|
---|
| 2605 | /* The outer_pos tracks the position within the strings
|
---|
| 2606 | * as seen by loop that calls fastmap_search.
|
---|
| 2607 | *
|
---|
| 2608 | * The caller supplied get_burst function actually
|
---|
| 2609 | * gives us pointers to chars.
|
---|
| 2610 | *
|
---|
| 2611 | * Communication with the get_burst function is through an
|
---|
| 2612 | * rx_string_position structure. Here, the structure for
|
---|
| 2613 | * outer_pos is initialized. It is set to point to the
|
---|
| 2614 | * NULL string, at an offset of STARTPOS. STARTPOS is out
|
---|
| 2615 | * of range of the NULL string, so the first call to
|
---|
| 2616 | * getburst will patch up the rx_string_position to point
|
---|
| 2617 | * to valid characters.
|
---|
| 2618 | */
|
---|
| 2619 |
|
---|
| 2620 | ( search_state.outer_pos.string
|
---|
| 2621 | = search_state.outer_pos.end
|
---|
| 2622 | = 0);
|
---|
| 2623 |
|
---|
| 2624 | search_state.outer_pos.offset = 0;
|
---|
| 2625 | search_state.outer_pos.size = 0;
|
---|
| 2626 | search_state.outer_pos.pos = (unsigned char *)startpos;
|
---|
| 2627 | init_fastmap (rxb, &search_state);
|
---|
| 2628 |
|
---|
| 2629 | search_state.fastmap_resume_pt = rx_fastmap_start;
|
---|
| 2630 | case rx_outer_fastmap:
|
---|
| 2631 | /* do { */
|
---|
| 2632 | pseudo_do:
|
---|
| 2633 | {
|
---|
| 2634 | {
|
---|
| 2635 | int fastmap_state;
|
---|
| 2636 | fastmap_state = fastmap_search (rxb, stop, get_burst, app_closure,
|
---|
| 2637 | &search_state);
|
---|
| 2638 | switch (fastmap_state)
|
---|
| 2639 | {
|
---|
| 2640 | case rx_fastmap_continuation:
|
---|
| 2641 | pc = rx_outer_fastmap;
|
---|
| 2642 | goto return_continuation;
|
---|
| 2643 | case rx_fastmap_fail:
|
---|
| 2644 | goto finish;
|
---|
| 2645 | case rx_fastmap_ok:
|
---|
| 2646 | break;
|
---|
| 2647 | }
|
---|
| 2648 | }
|
---|
| 2649 |
|
---|
| 2650 | /* now the fastmap loop has brought us to a plausible
|
---|
| 2651 | * starting point for a match. so, it's time to run the
|
---|
| 2652 | * nfa and see if a match occured.
|
---|
| 2653 | */
|
---|
| 2654 | startpos = ( search_state.outer_pos.pos
|
---|
| 2655 | - search_state.outer_pos.string
|
---|
| 2656 | + search_state.outer_pos.offset);
|
---|
| 2657 | /*|*/ if ((range > 0) && (startpos == search_state.outer_pos.search_end))
|
---|
| 2658 | /*|*/ goto finish;
|
---|
| 2659 | }
|
---|
| 2660 |
|
---|
| 2661 | search_state.test_match_resume_pt = rx_test_start;
|
---|
| 2662 | /* do interrupted for entry point... */
|
---|
| 2663 | case rx_outer_test:
|
---|
| 2664 | /* ...do continued */
|
---|
| 2665 | {
|
---|
| 2666 | goto test_match;
|
---|
| 2667 | test_returns_to_search:
|
---|
| 2668 | switch (test_state)
|
---|
| 2669 | {
|
---|
| 2670 | case rx_test_continuation:
|
---|
| 2671 | pc = rx_outer_test;
|
---|
| 2672 | goto return_continuation;
|
---|
| 2673 | case rx_test_error:
|
---|
| 2674 | search_state.ret_val = rx_search_error;
|
---|
| 2675 | goto finish;
|
---|
| 2676 | case rx_test_fail:
|
---|
| 2677 | break;
|
---|
| 2678 | case rx_test_ok:
|
---|
| 2679 | goto finish;
|
---|
| 2680 | }
|
---|
| 2681 | search_state.outer_pos.pos += search_state.outer_pos.search_direction;
|
---|
| 2682 | startpos += search_state.outer_pos.search_direction;
|
---|
| 2683 | #if 0
|
---|
| 2684 | /*|*/ if (search_state.test_pos.pos < search_state.test_pos.end)
|
---|
| 2685 | /*|*/ break;
|
---|
| 2686 | #endif
|
---|
| 2687 | }
|
---|
| 2688 | /* do interrupted for entry point... */
|
---|
| 2689 | case rx_outer_restore_pos:
|
---|
| 2690 | {
|
---|
| 2691 | int x;
|
---|
| 2692 | x = get_burst (&search_state.outer_pos, app_closure, stop);
|
---|
| 2693 | switch (x)
|
---|
| 2694 | {
|
---|
| 2695 | case rx_get_burst_continuation:
|
---|
| 2696 | pc = rx_outer_restore_pos;
|
---|
| 2697 | goto return_continuation;
|
---|
| 2698 | case rx_get_burst_error:
|
---|
| 2699 | search_state.ret_val = rx_search_error;
|
---|
| 2700 | goto finish;
|
---|
| 2701 | case rx_get_burst_no_more:
|
---|
| 2702 | if (rxb->can_match_empty)
|
---|
| 2703 | break;
|
---|
| 2704 | goto finish;
|
---|
| 2705 | case rx_get_burst_ok:
|
---|
| 2706 | break;
|
---|
| 2707 | }
|
---|
| 2708 | } /* } while (...see below...) */
|
---|
| 2709 |
|
---|
| 2710 | if ((search_state.outer_pos.search_direction == 1)
|
---|
| 2711 | ? (startpos < search_state.outer_pos.search_end)
|
---|
| 2712 | : (startpos > search_state.outer_pos.search_end))
|
---|
[23508] | 2713 | goto pseudo_do;
|
---|
[3745] | 2714 |
|
---|
| 2715 |
|
---|
| 2716 | finish:
|
---|
| 2717 | uninit_fastmap (rxb, &search_state);
|
---|
| 2718 | if (search_state.start_super)
|
---|
| 2719 | rx_unlock_superstate (&rxb->rx, search_state.start_super);
|
---|
| 2720 |
|
---|
| 2721 | #ifdef regex_malloc
|
---|
| 2722 | if (search_state.lparen) free (search_state.lparen);
|
---|
| 2723 | if (search_state.rparen) free (search_state.rparen);
|
---|
| 2724 | if (search_state.best_lpspace) free (search_state.best_lpspace);
|
---|
| 2725 | if (search_state.best_rpspace) free (search_state.best_rpspace);
|
---|
| 2726 | #endif
|
---|
| 2727 | return search_state.ret_val;
|
---|
| 2728 | }
|
---|
| 2729 |
|
---|
| 2730 |
|
---|
| 2731 | test_match:
|
---|
| 2732 | {
|
---|
| 2733 | enum rx_test_match_entry test_pc;
|
---|
| 2734 | intptr_t inx;
|
---|
| 2735 | test_pc = search_state.test_match_resume_pt;
|
---|
| 2736 | if (test_pc == rx_test_start)
|
---|
| 2737 | {
|
---|
| 2738 | #ifdef RX_DEBUG
|
---|
| 2739 | search_state.backtrack_depth = 0;
|
---|
| 2740 | #endif
|
---|
| 2741 | search_state.last_l = search_state.last_r = 0;
|
---|
| 2742 | search_state.lparen[0] = startpos;
|
---|
| 2743 | search_state.super = search_state.start_super;
|
---|
| 2744 | search_state.c = search_state.nfa_choice;
|
---|
| 2745 | search_state.test_pos.pos = search_state.outer_pos.pos - 1;
|
---|
| 2746 | search_state.test_pos.string = search_state.outer_pos.string;
|
---|
| 2747 | search_state.test_pos.end = search_state.outer_pos.end;
|
---|
| 2748 | search_state.test_pos.offset = search_state.outer_pos.offset;
|
---|
| 2749 | search_state.test_pos.size = search_state.outer_pos.size;
|
---|
| 2750 | search_state.test_pos.search_direction = 1;
|
---|
| 2751 | search_state.counter_stack = 0;
|
---|
| 2752 | search_state.backtrack_stack = 0;
|
---|
| 2753 | search_state.backtrack_frame_bytes =
|
---|
| 2754 | (sizeof (struct rx_backtrack_frame)
|
---|
| 2755 | + (rxb->match_regs_on_stack
|
---|
| 2756 | ? sizeof (regoff_t) * (search_state.num_regs + 1) * 2
|
---|
| 2757 | : 0));
|
---|
| 2758 | search_state.chunk_bytes = search_state.backtrack_frame_bytes * 64;
|
---|
| 2759 | search_state.test_ret = rx_test_line_finished;
|
---|
| 2760 | search_state.could_have_continued = 0;
|
---|
| 2761 | }
|
---|
| 2762 | /* This is while (1)...except that the body of the loop is interrupted
|
---|
| 2763 | * by some alternative entry points.
|
---|
| 2764 | */
|
---|
| 2765 | pseudo_while_1:
|
---|
| 2766 | switch (test_pc)
|
---|
| 2767 | {
|
---|
| 2768 | case rx_test_cache_hit_loop:
|
---|
| 2769 | goto resume_continuation_1;
|
---|
| 2770 | case rx_test_backreference_check:
|
---|
| 2771 | goto resume_continuation_2;
|
---|
| 2772 | case rx_test_backtrack_return:
|
---|
| 2773 | goto resume_continuation_3;
|
---|
| 2774 | case rx_test_start:
|
---|
| 2775 | #ifdef RX_DEBUG
|
---|
| 2776 | /* There is a search tree with every node as set of deterministic
|
---|
| 2777 | * transitions in the super nfa. For every branch of a
|
---|
| 2778 | * backtrack point is an edge in the tree.
|
---|
| 2779 | * This counts up a pre-order of nodes in that tree.
|
---|
| 2780 | * It's saved on the search stack and printed when debugging.
|
---|
| 2781 | */
|
---|
| 2782 | search_state.line_no = 0;
|
---|
| 2783 | search_state.lines_found = 0;
|
---|
| 2784 | #endif
|
---|
| 2785 |
|
---|
| 2786 | top_of_cycle:
|
---|
| 2787 | /* A superstate is basicly a transition table, indexed by
|
---|
| 2788 | * characters from the string being tested, and containing
|
---|
| 2789 | * RX_INX (`instruction frame') structures.
|
---|
| 2790 | */
|
---|
| 2791 | search_state.ifr = &search_state.super->transitions [search_state.c];
|
---|
| 2792 |
|
---|
| 2793 | recurse_test_match:
|
---|
| 2794 | /* This is the point to which control is sent when the
|
---|
| 2795 | * test matcher `recurses'. Before jumping here, some variables
|
---|
| 2796 | * need to be saved on the stack and the next instruction frame
|
---|
| 2797 | * has to be computed.
|
---|
| 2798 | */
|
---|
| 2799 |
|
---|
| 2800 | restart:
|
---|
| 2801 | /* Some instructions don't advance the matcher, but just
|
---|
| 2802 | * carry out some side effects and fetch a new instruction.
|
---|
| 2803 | * To dispatch that new instruction, `goto restart'.
|
---|
| 2804 | */
|
---|
| 2805 |
|
---|
| 2806 | {
|
---|
| 2807 | struct rx_inx * next_tr_table;
|
---|
| 2808 | struct rx_inx * this_tr_table;
|
---|
| 2809 | /* The fastest route through the loop is when the instruction
|
---|
| 2810 | * is RX_NEXT_CHAR. This case is detected when SEARCH_STATE.IFR->DATA
|
---|
| 2811 | * is non-zero. In that case, it points to the next
|
---|
| 2812 | * superstate.
|
---|
[23508] | 2813 | *
|
---|
[3745] | 2814 | * This allows us to not bother fetching the bytecode.
|
---|
| 2815 | */
|
---|
| 2816 | next_tr_table = (struct rx_inx *)search_state.ifr->data;
|
---|
| 2817 | this_tr_table = search_state.super->transitions;
|
---|
| 2818 | while (next_tr_table)
|
---|
| 2819 | {
|
---|
| 2820 | #ifdef RX_DEBUG_0
|
---|
| 2821 | if (rx_debug_trace)
|
---|
| 2822 | {
|
---|
| 2823 | struct rx_superset * setp;
|
---|
| 2824 |
|
---|
| 2825 | fprintf (stderr, "%d %d>> re_next_char @ %d (%d)",
|
---|
| 2826 | search_state.line_no,
|
---|
| 2827 | search_state.backtrack_depth,
|
---|
| 2828 | (search_state.test_pos.pos - search_state.test_pos.string
|
---|
| 2829 | + search_state.test_pos.offset), search_state.c);
|
---|
| 2830 |
|
---|
| 2831 | search_state.super =
|
---|
| 2832 | ((struct rx_superstate *)
|
---|
| 2833 | ((char *)this_tr_table
|
---|
| 2834 | - ((mg_u_long)
|
---|
| 2835 | ((struct rx_superstate *)0)->transitions)));
|
---|
| 2836 |
|
---|
| 2837 | setp = search_state.super->contents;
|
---|
| 2838 | fprintf (stderr, " superstet (rx=%d, &=%x: ",
|
---|
| 2839 | rxb->rx.rx_id, setp);
|
---|
| 2840 | while (setp)
|
---|
| 2841 | {
|
---|
| 2842 | fprintf (stderr, "%d ", setp->id);
|
---|
| 2843 | setp = setp->cdr;
|
---|
| 2844 | }
|
---|
| 2845 | fprintf (stderr, "\n");
|
---|
| 2846 | }
|
---|
| 2847 | #endif
|
---|
| 2848 | this_tr_table = next_tr_table;
|
---|
| 2849 | ++search_state.test_pos.pos;
|
---|
| 2850 | if (search_state.test_pos.pos == search_state.test_pos.end)
|
---|
| 2851 | {
|
---|
| 2852 | int burst_state;
|
---|
| 2853 | try_burst_1:
|
---|
| 2854 | burst_state = get_burst (&search_state.test_pos,
|
---|
| 2855 | app_closure, stop);
|
---|
| 2856 | switch (burst_state)
|
---|
| 2857 | {
|
---|
| 2858 | case rx_get_burst_continuation:
|
---|
| 2859 | search_state.saved_this_tr_table = this_tr_table;
|
---|
| 2860 | search_state.saved_next_tr_table = next_tr_table;
|
---|
| 2861 | test_pc = rx_test_cache_hit_loop;
|
---|
| 2862 | goto test_return_continuation;
|
---|
| 2863 |
|
---|
| 2864 | resume_continuation_1:
|
---|
| 2865 | /* Continuation one jumps here to do its work: */
|
---|
| 2866 | search_state.saved_this_tr_table = this_tr_table;
|
---|
| 2867 | search_state.saved_next_tr_table = next_tr_table;
|
---|
| 2868 | goto try_burst_1;
|
---|
| 2869 |
|
---|
| 2870 | case rx_get_burst_ok:
|
---|
| 2871 | /* get_burst succeeded...keep going */
|
---|
| 2872 | break;
|
---|
| 2873 |
|
---|
| 2874 | case rx_get_burst_no_more:
|
---|
| 2875 | search_state.test_ret = rx_test_line_finished;
|
---|
| 2876 | search_state.could_have_continued = 1;
|
---|
| 2877 | goto test_do_return;
|
---|
[23508] | 2878 |
|
---|
[3745] | 2879 | case rx_get_burst_error:
|
---|
| 2880 | /* An error... */
|
---|
| 2881 | search_state.test_ret = rx_test_internal_error;
|
---|
| 2882 | goto test_do_return;
|
---|
| 2883 | }
|
---|
| 2884 | }
|
---|
[23508] | 2885 | search_state.c = *search_state.test_pos.pos;
|
---|
[3745] | 2886 | search_state.ifr = this_tr_table + search_state.c;
|
---|
| 2887 | next_tr_table = (struct rx_inx *)search_state.ifr->data;
|
---|
| 2888 | } /* Fast loop through cached transition tables */
|
---|
| 2889 |
|
---|
| 2890 | /* Here when we ran out of cached next-char transitions.
|
---|
| 2891 | * So, it will be necessary to do a more expensive
|
---|
| 2892 | * dispatch on the current instruction. The superstate
|
---|
| 2893 | * pointer is allowed to become invalid during next-char
|
---|
| 2894 | * transitions -- now we must bring it up to date.
|
---|
| 2895 | */
|
---|
| 2896 | search_state.super =
|
---|
| 2897 | ((struct rx_superstate *)
|
---|
| 2898 | ((char *)this_tr_table
|
---|
| 2899 | - ((uintptr_t)
|
---|
| 2900 | ((struct rx_superstate *)0)->transitions)));
|
---|
| 2901 | }
|
---|
| 2902 |
|
---|
| 2903 | /* We've encountered an instruction other than next-char.
|
---|
| 2904 | * Dispatch that instruction:
|
---|
| 2905 | */
|
---|
| 2906 | inx = (intptr_t)search_state.ifr->inx;
|
---|
| 2907 | #ifdef RX_DEBUG_0
|
---|
| 2908 | if (rx_debug_trace)
|
---|
| 2909 | {
|
---|
| 2910 | struct rx_superset * setp = search_state.super->contents;
|
---|
| 2911 |
|
---|
| 2912 | fprintf (stderr, "%d %d>> %s @ %d (%d)", search_state.line_no,
|
---|
| 2913 | search_state.backtrack_depth,
|
---|
| 2914 | inx_names[inx],
|
---|
| 2915 | (search_state.test_pos.pos - search_state.test_pos.string
|
---|
| 2916 | + (test_pos.half == 0 ? 0 : size1)), search_state.c);
|
---|
| 2917 |
|
---|
| 2918 | fprintf (stderr, " superstet (rx=%d, &=%x: ",
|
---|
| 2919 | rxb->rx.rx_id, setp);
|
---|
| 2920 | while (setp)
|
---|
| 2921 | {
|
---|
| 2922 | fprintf (stderr, "%d ", setp->id);
|
---|
[23508] | 2923 | setp = setp->cdr;
|
---|
| 2924 | }
|
---|
[3745] | 2925 | fprintf (stderr, "\n");
|
---|
| 2926 | }
|
---|
| 2927 | #endif
|
---|
| 2928 | switch ((enum rx_opcode)inx)
|
---|
| 2929 | {
|
---|
| 2930 | case rx_do_side_effects:
|
---|
| 2931 |
|
---|
| 2932 | /* RX_DO_SIDE_EFFECTS occurs when we cross epsilon
|
---|
| 2933 | * edges associated with parentheses, backreferencing, etc.
|
---|
| 2934 | */
|
---|
| 2935 | {
|
---|
| 2936 | struct rx_distinct_future * df =
|
---|
| 2937 | (struct rx_distinct_future *)search_state.ifr->data_2;
|
---|
| 2938 | struct rx_se_list * el = df->effects;
|
---|
| 2939 | /* Side effects come in lists. This walks down
|
---|
| 2940 | * a list, dispatching.
|
---|
| 2941 | */
|
---|
| 2942 | while (el)
|
---|
| 2943 | {
|
---|
| 2944 | intptr_t effect;
|
---|
| 2945 | effect = (intptr_t)el->car;
|
---|
| 2946 | if (effect < 0)
|
---|
| 2947 | {
|
---|
| 2948 | #ifdef RX_DEBUG_0
|
---|
| 2949 | if (rx_debug_trace)
|
---|
| 2950 | {
|
---|
| 2951 | struct rx_superset * setp = search_state.super->contents;
|
---|
| 2952 |
|
---|
| 2953 | fprintf (stderr, "....%d %d>> %s\n", search_state.line_no,
|
---|
| 2954 | search_state.backtrack_depth,
|
---|
| 2955 | efnames[-effect]);
|
---|
| 2956 | }
|
---|
| 2957 | #endif
|
---|
| 2958 | switch ((enum re_side_effects) effect)
|
---|
| 2959 |
|
---|
| 2960 | {
|
---|
| 2961 | case re_se_pushback:
|
---|
| 2962 | search_state.ifr = &df->future_frame;
|
---|
| 2963 | if (!search_state.ifr->data)
|
---|
[23508] | 2964 | {
|
---|
[3745] | 2965 | struct rx_superstate * sup;
|
---|
| 2966 | sup = search_state.super;
|
---|
| 2967 | rx_lock_superstate (rx, sup);
|
---|
| 2968 | if (!rx_handle_cache_miss (&rxb->rx,
|
---|
| 2969 | search_state.super,
|
---|
| 2970 | search_state.c,
|
---|
| 2971 | (search_state.ifr
|
---|
| 2972 | ->data_2)))
|
---|
| 2973 | {
|
---|
| 2974 | rx_unlock_superstate (rx, sup);
|
---|
| 2975 | search_state.test_ret = rx_test_internal_error;
|
---|
| 2976 | goto test_do_return;
|
---|
| 2977 | }
|
---|
| 2978 | rx_unlock_superstate (rx, sup);
|
---|
| 2979 | }
|
---|
| 2980 | /* --search_state.test_pos.pos; */
|
---|
| 2981 | search_state.c = 't';
|
---|
| 2982 | search_state.super
|
---|
| 2983 | = ((struct rx_superstate *)
|
---|
| 2984 | ((char *)search_state.ifr->data
|
---|
| 2985 | - (intptr_t)(((struct rx_superstate *)0)
|
---|
| 2986 | ->transitions)));
|
---|
| 2987 | goto top_of_cycle;
|
---|
| 2988 | break;
|
---|
| 2989 | case re_se_push0:
|
---|
| 2990 | {
|
---|
| 2991 | struct rx_counter_frame * old_cf
|
---|
| 2992 | = (search_state.counter_stack
|
---|
| 2993 | ? ((struct rx_counter_frame *)
|
---|
| 2994 | search_state.counter_stack->sp)
|
---|
| 2995 | : 0);
|
---|
| 2996 | struct rx_counter_frame * cf;
|
---|
| 2997 | PUSH (search_state.counter_stack,
|
---|
| 2998 | sizeof (struct rx_counter_frame));
|
---|
| 2999 | cf = ((struct rx_counter_frame *)
|
---|
| 3000 | search_state.counter_stack->sp);
|
---|
| 3001 | cf->tag = re_se_iter;
|
---|
| 3002 | cf->val = 0;
|
---|
| 3003 | cf->inherited_from = 0;
|
---|
| 3004 | cf->cdr = old_cf;
|
---|
| 3005 | break;
|
---|
| 3006 | }
|
---|
| 3007 | case re_se_fail:
|
---|
| 3008 | goto test_do_return;
|
---|
| 3009 | case re_se_begbuf:
|
---|
| 3010 | if (!AT_STRINGS_BEG ())
|
---|
| 3011 | goto test_do_return;
|
---|
| 3012 | break;
|
---|
| 3013 | case re_se_endbuf:
|
---|
| 3014 | if (!AT_STRINGS_END ())
|
---|
| 3015 | goto test_do_return;
|
---|
| 3016 | break;
|
---|
| 3017 | case re_se_wordbeg:
|
---|
| 3018 | if ( LETTER_P (&search_state.test_pos, 1)
|
---|
| 3019 | && ( AT_STRINGS_BEG()
|
---|
| 3020 | || !LETTER_P (&search_state.test_pos, 0)))
|
---|
| 3021 | break;
|
---|
| 3022 | else
|
---|
| 3023 | goto test_do_return;
|
---|
| 3024 | case re_se_wordend:
|
---|
| 3025 | if ( !AT_STRINGS_BEG ()
|
---|
| 3026 | && LETTER_P (&search_state.test_pos, 0)
|
---|
| 3027 | && (AT_STRINGS_END ()
|
---|
| 3028 | || !LETTER_P (&search_state.test_pos, 1)))
|
---|
| 3029 | break;
|
---|
| 3030 | else
|
---|
| 3031 | goto test_do_return;
|
---|
| 3032 | case re_se_wordbound:
|
---|
| 3033 | if (AT_WORD_BOUNDARY (&search_state.test_pos))
|
---|
| 3034 | break;
|
---|
| 3035 | else
|
---|
| 3036 | goto test_do_return;
|
---|
| 3037 | case re_se_notwordbound:
|
---|
| 3038 | if (!AT_WORD_BOUNDARY (&search_state.test_pos))
|
---|
| 3039 | break;
|
---|
| 3040 | else
|
---|
| 3041 | goto test_do_return;
|
---|
| 3042 | case re_se_hat:
|
---|
| 3043 | if (AT_STRINGS_BEG ())
|
---|
| 3044 | {
|
---|
| 3045 | if (rxb->not_bol)
|
---|
| 3046 | goto test_do_return;
|
---|
| 3047 | else
|
---|
| 3048 | break;
|
---|
| 3049 | }
|
---|
| 3050 | else
|
---|
| 3051 | {
|
---|
| 3052 | char pos_c = *search_state.test_pos.pos;
|
---|
| 3053 | if ( (SRCH_TRANSLATE (pos_c)
|
---|
| 3054 | == SRCH_TRANSLATE('\n'))
|
---|
| 3055 | && rxb->newline_anchor)
|
---|
| 3056 | break;
|
---|
| 3057 | else
|
---|
| 3058 | goto test_do_return;
|
---|
| 3059 | }
|
---|
| 3060 | case re_se_dollar:
|
---|
| 3061 | if (AT_STRINGS_END ())
|
---|
| 3062 | {
|
---|
| 3063 | if (rxb->not_eol)
|
---|
| 3064 | goto test_do_return;
|
---|
| 3065 | else
|
---|
| 3066 | break;
|
---|
| 3067 | }
|
---|
| 3068 | else
|
---|
| 3069 | {
|
---|
| 3070 | if ( ( SRCH_TRANSLATE (fetch_char
|
---|
| 3071 | (&search_state.test_pos, 1,
|
---|
| 3072 | app_closure, stop))
|
---|
| 3073 | == SRCH_TRANSLATE ('\n'))
|
---|
| 3074 | && rxb->newline_anchor)
|
---|
| 3075 | break;
|
---|
| 3076 | else
|
---|
| 3077 | goto test_do_return;
|
---|
| 3078 | }
|
---|
| 3079 |
|
---|
| 3080 | case re_se_try:
|
---|
| 3081 | /* This is the first side effect in every
|
---|
| 3082 | * expression.
|
---|
| 3083 | *
|
---|
| 3084 | * FOR NO GOOD REASON...get rid of it...
|
---|
| 3085 | */
|
---|
| 3086 | break;
|
---|
| 3087 |
|
---|
| 3088 | case re_se_pushpos:
|
---|
| 3089 | {
|
---|
| 3090 | int urhere =
|
---|
| 3091 | ((int)(search_state.test_pos.pos
|
---|
| 3092 | - search_state.test_pos.string)
|
---|
| 3093 | + search_state.test_pos.offset);
|
---|
| 3094 | struct rx_counter_frame * old_cf
|
---|
| 3095 | = (search_state.counter_stack
|
---|
| 3096 | ? ((struct rx_counter_frame *)
|
---|
| 3097 | search_state.counter_stack->sp)
|
---|
| 3098 | : 0);
|
---|
| 3099 | struct rx_counter_frame * cf;
|
---|
| 3100 | PUSH(search_state.counter_stack,
|
---|
| 3101 | sizeof (struct rx_counter_frame));
|
---|
| 3102 | cf = ((struct rx_counter_frame *)
|
---|
| 3103 | search_state.counter_stack->sp);
|
---|
| 3104 | cf->tag = re_se_pushpos;
|
---|
| 3105 | cf->val = urhere;
|
---|
| 3106 | cf->inherited_from = 0;
|
---|
| 3107 | cf->cdr = old_cf;
|
---|
| 3108 | break;
|
---|
| 3109 | }
|
---|
| 3110 |
|
---|
| 3111 | case re_se_chkpos:
|
---|
| 3112 | {
|
---|
| 3113 | int urhere =
|
---|
| 3114 | ((int)(search_state.test_pos.pos
|
---|
| 3115 | - search_state.test_pos.string)
|
---|
| 3116 | + search_state.test_pos.offset);
|
---|
| 3117 | struct rx_counter_frame * cf
|
---|
| 3118 | = ((struct rx_counter_frame *)
|
---|
| 3119 | search_state.counter_stack->sp);
|
---|
| 3120 | if (cf->val == urhere)
|
---|
| 3121 | goto test_do_return;
|
---|
| 3122 | cf->val = urhere;
|
---|
| 3123 | break;
|
---|
| 3124 | }
|
---|
| 3125 | break;
|
---|
| 3126 |
|
---|
| 3127 | case re_se_poppos:
|
---|
| 3128 | POP(search_state.counter_stack,
|
---|
| 3129 | sizeof (struct rx_counter_frame));
|
---|
| 3130 | break;
|
---|
| 3131 |
|
---|
| 3132 |
|
---|
| 3133 | case re_se_at_dot:
|
---|
| 3134 | case re_se_syntax:
|
---|
| 3135 | case re_se_not_syntax:
|
---|
| 3136 | #ifdef emacs
|
---|
| 3137 | /*
|
---|
| 3138 | * this release lacks emacs support
|
---|
| 3139 | */
|
---|
| 3140 | #endif
|
---|
| 3141 | break;
|
---|
| 3142 | case re_se_win:
|
---|
| 3143 | case re_se_lparen:
|
---|
| 3144 | case re_se_rparen:
|
---|
| 3145 | case re_se_backref:
|
---|
| 3146 | case re_se_iter:
|
---|
| 3147 | case re_se_end_iter:
|
---|
| 3148 | case re_se_tv:
|
---|
| 3149 | case re_floogle_flap:
|
---|
| 3150 | search_state.ret_val = 0;
|
---|
| 3151 | goto test_do_return;
|
---|
| 3152 | }
|
---|
| 3153 | }
|
---|
| 3154 | else
|
---|
| 3155 | {
|
---|
| 3156 | #ifdef RX_DEBUG_0
|
---|
| 3157 | if (rx_debug_trace)
|
---|
| 3158 | fprintf (stderr, "....%d %d>> %s %d %d\n", search_state.line_no,
|
---|
| 3159 | search_state.backtrack_depth,
|
---|
| 3160 | efnames2[rxb->se_params [effect].se],
|
---|
| 3161 | rxb->se_params [effect].op1,
|
---|
| 3162 | rxb->se_params [effect].op2);
|
---|
| 3163 | #endif
|
---|
| 3164 | switch (rxb->se_params [effect].se)
|
---|
| 3165 | {
|
---|
| 3166 | case re_se_win:
|
---|
| 3167 | /* This side effect indicates that we've
|
---|
| 3168 | * found a match, though not necessarily the
|
---|
| 3169 | * best match. This is a fancy assignment to
|
---|
| 3170 | * register 0 unless the caller didn't
|
---|
| 3171 | * care about registers. In which case,
|
---|
| 3172 | * this stops the match.
|
---|
| 3173 | */
|
---|
| 3174 | {
|
---|
| 3175 | int urhere =
|
---|
| 3176 | ((int)(search_state.test_pos.pos
|
---|
| 3177 | - search_state.test_pos.string)
|
---|
| 3178 | + search_state.test_pos.offset);
|
---|
| 3179 |
|
---|
| 3180 | if ( (search_state.best_last_r < 0)
|
---|
| 3181 | || (urhere + 1 > search_state.best_rparen[0]))
|
---|
| 3182 | {
|
---|
| 3183 | /* Record the best known and keep
|
---|
| 3184 | * looking.
|
---|
| 3185 | */
|
---|
| 3186 | int x;
|
---|
| 3187 | for (x = 0; x <= search_state.last_l; ++x)
|
---|
| 3188 | search_state.best_lparen[x] = search_state.lparen[x];
|
---|
| 3189 | search_state.best_last_l = search_state.last_l;
|
---|
| 3190 | for (x = 0; x <= search_state.last_r; ++x)
|
---|
| 3191 | search_state.best_rparen[x] = search_state.rparen[x];
|
---|
| 3192 | search_state.best_rparen[0] = urhere + 1;
|
---|
| 3193 | search_state.best_last_r = search_state.last_r;
|
---|
| 3194 | }
|
---|
| 3195 | /* If we're not reporting the match-length
|
---|
| 3196 | * or other register info, we need look no
|
---|
| 3197 | * further.
|
---|
| 3198 | */
|
---|
| 3199 | if (search_state.first_found)
|
---|
| 3200 | {
|
---|
| 3201 | search_state.test_ret = rx_test_found_first;
|
---|
| 3202 | goto test_do_return;
|
---|
| 3203 | }
|
---|
| 3204 | }
|
---|
| 3205 | break;
|
---|
| 3206 | case re_se_lparen:
|
---|
| 3207 | {
|
---|
| 3208 | int urhere =
|
---|
| 3209 | ((int)(search_state.test_pos.pos
|
---|
| 3210 | - search_state.test_pos.string)
|
---|
| 3211 | + search_state.test_pos.offset);
|
---|
| 3212 |
|
---|
| 3213 | int reg = rxb->se_params [effect].op1;
|
---|
| 3214 | #if 0
|
---|
| 3215 | if (reg > search_state.last_l)
|
---|
| 3216 | #endif
|
---|
| 3217 | {
|
---|
| 3218 | search_state.lparen[reg] = urhere + 1;
|
---|
| 3219 | /* In addition to making this assignment,
|
---|
| 3220 | * we now know that lower numbered regs
|
---|
| 3221 | * that haven't already been assigned,
|
---|
| 3222 | * won't be. We make sure they're
|
---|
| 3223 | * filled with -1, so they can be
|
---|
| 3224 | * recognized as unassigned.
|
---|
| 3225 | */
|
---|
| 3226 | if (search_state.last_l < reg)
|
---|
| 3227 | while (++search_state.last_l < reg)
|
---|
| 3228 | search_state.lparen[search_state.last_l] = -1;
|
---|
| 3229 | }
|
---|
| 3230 | break;
|
---|
| 3231 | }
|
---|
| 3232 |
|
---|
| 3233 | case re_se_rparen:
|
---|
| 3234 | {
|
---|
| 3235 | int urhere =
|
---|
| 3236 | ((int)(search_state.test_pos.pos
|
---|
| 3237 | - search_state.test_pos.string)
|
---|
| 3238 | + search_state.test_pos.offset);
|
---|
| 3239 | int reg = rxb->se_params [effect].op1;
|
---|
| 3240 | search_state.rparen[reg] = urhere + 1;
|
---|
| 3241 | if (search_state.last_r < reg)
|
---|
| 3242 | {
|
---|
| 3243 | while (++search_state.last_r < reg)
|
---|
| 3244 | search_state.rparen[search_state.last_r]
|
---|
| 3245 | = -1;
|
---|
| 3246 | }
|
---|
| 3247 | break;
|
---|
| 3248 | }
|
---|
| 3249 |
|
---|
| 3250 | case re_se_backref:
|
---|
| 3251 | {
|
---|
| 3252 | int reg = rxb->se_params [effect].op1;
|
---|
| 3253 | if ( reg > search_state.last_r
|
---|
| 3254 | || search_state.rparen[reg] < 0)
|
---|
| 3255 | goto test_do_return;
|
---|
| 3256 |
|
---|
| 3257 | {
|
---|
| 3258 | int backref_status;
|
---|
| 3259 | check_backreference:
|
---|
| 3260 | backref_status
|
---|
| 3261 | = back_check (&search_state.test_pos,
|
---|
| 3262 | search_state.lparen[reg],
|
---|
| 3263 | search_state.rparen[reg],
|
---|
| 3264 | search_state.translate,
|
---|
| 3265 | app_closure,
|
---|
| 3266 | stop);
|
---|
| 3267 | switch (backref_status)
|
---|
| 3268 | {
|
---|
| 3269 | case rx_back_check_continuation:
|
---|
| 3270 | search_state.saved_reg = reg;
|
---|
| 3271 | test_pc = rx_test_backreference_check;
|
---|
| 3272 | goto test_return_continuation;
|
---|
| 3273 | resume_continuation_2:
|
---|
| 3274 | reg = search_state.saved_reg;
|
---|
| 3275 | goto check_backreference;
|
---|
| 3276 | case rx_back_check_fail:
|
---|
| 3277 | /* Fail */
|
---|
| 3278 | goto test_do_return;
|
---|
| 3279 | case rx_back_check_pass:
|
---|
| 3280 | /* pass --
|
---|
| 3281 | * test_pos now advanced to last
|
---|
| 3282 | * char matched by backref
|
---|
| 3283 | */
|
---|
| 3284 | break;
|
---|
| 3285 | }
|
---|
| 3286 | }
|
---|
| 3287 | break;
|
---|
| 3288 | }
|
---|
| 3289 | case re_se_iter:
|
---|
| 3290 | {
|
---|
| 3291 | struct rx_counter_frame * csp
|
---|
| 3292 | = ((struct rx_counter_frame *)
|
---|
| 3293 | search_state.counter_stack->sp);
|
---|
| 3294 | if (csp->val == rxb->se_params[effect].op2)
|
---|
| 3295 | goto test_do_return;
|
---|
| 3296 | else
|
---|
| 3297 | ++csp->val;
|
---|
| 3298 | break;
|
---|
| 3299 | }
|
---|
| 3300 | case re_se_end_iter:
|
---|
| 3301 | {
|
---|
| 3302 | struct rx_counter_frame * csp
|
---|
| 3303 | = ((struct rx_counter_frame *)
|
---|
| 3304 | search_state.counter_stack->sp);
|
---|
| 3305 | if (csp->val < rxb->se_params[effect].op1)
|
---|
| 3306 | goto test_do_return;
|
---|
| 3307 | else
|
---|
| 3308 | {
|
---|
| 3309 | struct rx_counter_frame * source = csp;
|
---|
| 3310 | while (source->inherited_from)
|
---|
| 3311 | source = source->inherited_from;
|
---|
| 3312 | if (!source || !source->cdr)
|
---|
| 3313 | {
|
---|
| 3314 | POP(search_state.counter_stack,
|
---|
| 3315 | sizeof(struct rx_counter_frame));
|
---|
| 3316 | }
|
---|
| 3317 | else
|
---|
| 3318 | {
|
---|
| 3319 | source = source->cdr;
|
---|
| 3320 | csp->val = source->val;
|
---|
| 3321 | csp->tag = source->tag;
|
---|
| 3322 | csp->cdr = 0;
|
---|
| 3323 | csp->inherited_from = source;
|
---|
| 3324 | }
|
---|
| 3325 | }
|
---|
| 3326 | break;
|
---|
| 3327 | }
|
---|
| 3328 | case re_se_tv:
|
---|
| 3329 | /* is a noop */
|
---|
| 3330 | break;
|
---|
| 3331 | case re_se_try:
|
---|
| 3332 | case re_se_pushback:
|
---|
| 3333 | case re_se_push0:
|
---|
| 3334 | case re_se_pushpos:
|
---|
| 3335 | case re_se_chkpos:
|
---|
| 3336 | case re_se_poppos:
|
---|
| 3337 | case re_se_at_dot:
|
---|
| 3338 | case re_se_syntax:
|
---|
| 3339 | case re_se_not_syntax:
|
---|
| 3340 | case re_se_begbuf:
|
---|
| 3341 | case re_se_hat:
|
---|
| 3342 | case re_se_wordbeg:
|
---|
| 3343 | case re_se_wordbound:
|
---|
| 3344 | case re_se_notwordbound:
|
---|
| 3345 | case re_se_wordend:
|
---|
| 3346 | case re_se_endbuf:
|
---|
| 3347 | case re_se_dollar:
|
---|
| 3348 | case re_se_fail:
|
---|
| 3349 | case re_floogle_flap:
|
---|
| 3350 | search_state.ret_val = 0;
|
---|
| 3351 | goto test_do_return;
|
---|
| 3352 | }
|
---|
| 3353 | }
|
---|
| 3354 | el = el->cdr;
|
---|
| 3355 | }
|
---|
| 3356 | /* Now the side effects are done,
|
---|
| 3357 | * so get the next instruction.
|
---|
| 3358 | * and move on.
|
---|
| 3359 | */
|
---|
| 3360 | search_state.ifr = &df->future_frame;
|
---|
| 3361 | goto restart;
|
---|
| 3362 | }
|
---|
| 3363 |
|
---|
| 3364 | case rx_backtrack_point:
|
---|
| 3365 | {
|
---|
| 3366 | /* A backtrack point indicates that we've reached a
|
---|
| 3367 | * non-determinism in the superstate NFA. This is a
|
---|
| 3368 | * loop that exhaustively searches the possibilities.
|
---|
| 3369 | *
|
---|
| 3370 | * A backtracking strategy is used. We keep track of what
|
---|
| 3371 | * registers are valid so we can erase side effects.
|
---|
| 3372 | *
|
---|
| 3373 | * First, make sure there is some stack space to hold
|
---|
| 3374 | * our state.
|
---|
| 3375 | */
|
---|
| 3376 |
|
---|
| 3377 | struct rx_backtrack_frame * bf;
|
---|
| 3378 |
|
---|
| 3379 | PUSH(search_state.backtrack_stack,
|
---|
| 3380 | search_state.backtrack_frame_bytes);
|
---|
| 3381 | #ifdef RX_DEBUG_0
|
---|
| 3382 | ++search_state.backtrack_depth;
|
---|
| 3383 | #endif
|
---|
| 3384 |
|
---|
| 3385 | bf = ((struct rx_backtrack_frame *)
|
---|
| 3386 | search_state.backtrack_stack->sp);
|
---|
| 3387 | {
|
---|
| 3388 | bf->stk_super = search_state.super;
|
---|
| 3389 | /* We prevent the current superstate from being
|
---|
| 3390 | * deleted from the superstate cache.
|
---|
| 3391 | */
|
---|
| 3392 | rx_lock_superstate (&rxb->rx, search_state.super);
|
---|
| 3393 | #ifdef RX_DEBUG_0
|
---|
| 3394 | bf->stk_search_state.line_no = search_state.line_no;
|
---|
| 3395 | #endif
|
---|
| 3396 | bf->stk_c = search_state.c;
|
---|
| 3397 | bf->stk_test_pos = search_state.test_pos;
|
---|
| 3398 | bf->stk_last_l = search_state.last_l;
|
---|
| 3399 | bf->stk_last_r = search_state.last_r;
|
---|
| 3400 | bf->df = ((struct rx_super_edge *)
|
---|
| 3401 | search_state.ifr->data_2)->options;
|
---|
| 3402 | bf->first_df = bf->df;
|
---|
| 3403 | bf->counter_stack_sp = (search_state.counter_stack
|
---|
| 3404 | ? search_state.counter_stack->sp
|
---|
| 3405 | : 0);
|
---|
| 3406 | bf->stk_test_ret = search_state.test_ret;
|
---|
| 3407 | if (rxb->match_regs_on_stack)
|
---|
| 3408 | {
|
---|
| 3409 | int x;
|
---|
| 3410 | regoff_t * stk =
|
---|
| 3411 | (regoff_t *)((char *)bf + sizeof (*bf));
|
---|
| 3412 | for (x = 0; x <= search_state.last_l; ++x)
|
---|
| 3413 | stk[x] = search_state.lparen[x];
|
---|
| 3414 | stk += x;
|
---|
| 3415 | for (x = 0; x <= search_state.last_r; ++x)
|
---|
| 3416 | stk[x] = search_state.rparen[x];
|
---|
| 3417 | }
|
---|
| 3418 | }
|
---|
| 3419 |
|
---|
| 3420 | /* Here is a while loop whose body is mainly a function
|
---|
| 3421 | * call and some code to handle a return from that
|
---|
| 3422 | * function.
|
---|
| 3423 | *
|
---|
| 3424 | * From here on for the rest of `case backtrack_point' it
|
---|
| 3425 | * is unsafe to assume that the search_state copies of
|
---|
| 3426 | * variables saved on the backtracking stack are valid
|
---|
| 3427 | * -- so read their values from the backtracking stack.
|
---|
| 3428 | *
|
---|
| 3429 | * This lets us use one generation fewer stack saves in
|
---|
| 3430 | * the call-graph of a search.
|
---|
| 3431 | */
|
---|
| 3432 |
|
---|
| 3433 | while_non_det_options:
|
---|
| 3434 | #ifdef RX_DEBUG_0
|
---|
| 3435 | ++search_state.lines_found;
|
---|
| 3436 | if (rx_debug_trace)
|
---|
| 3437 | fprintf (stderr, "@@@ %d calls %d @@@\n",
|
---|
| 3438 | search_state.line_no, search_state.lines_found);
|
---|
| 3439 |
|
---|
| 3440 | search_state.line_no = search_state.lines_found;
|
---|
| 3441 | #endif
|
---|
| 3442 |
|
---|
| 3443 | if (bf->df->next_same_super_edge[0] == bf->first_df)
|
---|
| 3444 | {
|
---|
| 3445 | /* This is a tail-call optimization -- we don't recurse
|
---|
| 3446 | * for the last of the possible futures.
|
---|
| 3447 | */
|
---|
| 3448 | search_state.ifr = (bf->df->effects
|
---|
| 3449 | ? &bf->df->side_effects_frame
|
---|
| 3450 | : &bf->df->future_frame);
|
---|
| 3451 |
|
---|
| 3452 | rx_unlock_superstate (&rxb->rx, search_state.super);
|
---|
| 3453 | POP(search_state.backtrack_stack,
|
---|
| 3454 | search_state.backtrack_frame_bytes);
|
---|
| 3455 | #ifdef RX_DEBUG
|
---|
| 3456 | --search_state.backtrack_depth;
|
---|
| 3457 | #endif
|
---|
| 3458 | goto restart;
|
---|
| 3459 | }
|
---|
| 3460 | else
|
---|
| 3461 | {
|
---|
| 3462 | if (search_state.counter_stack)
|
---|
| 3463 | {
|
---|
| 3464 | struct rx_counter_frame * old_cf
|
---|
| 3465 | = ((struct rx_counter_frame *)search_state.counter_stack->sp);
|
---|
| 3466 | struct rx_counter_frame * cf;
|
---|
| 3467 | PUSH(search_state.counter_stack, sizeof (struct rx_counter_frame));
|
---|
| 3468 | cf = ((struct rx_counter_frame *)search_state.counter_stack->sp);
|
---|
| 3469 | cf->tag = old_cf->tag;
|
---|
| 3470 | cf->val = old_cf->val;
|
---|
| 3471 | cf->inherited_from = old_cf;
|
---|
| 3472 | cf->cdr = 0;
|
---|
| 3473 | }
|
---|
| 3474 | /* `Call' this test-match block */
|
---|
| 3475 | search_state.ifr = (bf->df->effects
|
---|
| 3476 | ? &bf->df->side_effects_frame
|
---|
| 3477 | : &bf->df->future_frame);
|
---|
| 3478 | goto recurse_test_match;
|
---|
| 3479 | }
|
---|
| 3480 |
|
---|
| 3481 | /* Returns in this block are accomplished by
|
---|
| 3482 | * goto test_do_return. There are two cases.
|
---|
| 3483 | * If there is some search-stack left,
|
---|
| 3484 | * then it is a return from a `recursive' call.
|
---|
| 3485 | * If there is no search-stack left, then
|
---|
| 3486 | * we should return to the fastmap/search loop.
|
---|
| 3487 | */
|
---|
| 3488 |
|
---|
| 3489 | test_do_return:
|
---|
| 3490 |
|
---|
| 3491 | if (!search_state.backtrack_stack)
|
---|
| 3492 | {
|
---|
| 3493 | #ifdef RX_DEBUG_0
|
---|
| 3494 | if (rx_debug_trace)
|
---|
| 3495 | fprintf (stderr, "!!! %d bails returning %d !!!\n",
|
---|
| 3496 | search_state.line_no, search_state.test_ret);
|
---|
| 3497 | #endif
|
---|
| 3498 |
|
---|
| 3499 | /* No more search-stack -- this test is done. */
|
---|
| 3500 | if (search_state.test_ret)
|
---|
| 3501 | goto return_from_test_match;
|
---|
| 3502 | else
|
---|
| 3503 | goto error_in_testing_match;
|
---|
| 3504 | }
|
---|
| 3505 |
|
---|
| 3506 | /* Returning from a recursive call to
|
---|
| 3507 | * the test match block:
|
---|
| 3508 | */
|
---|
| 3509 |
|
---|
| 3510 | bf = ((struct rx_backtrack_frame *)
|
---|
| 3511 | search_state.backtrack_stack->sp);
|
---|
| 3512 | #ifdef RX_DEBUG_0
|
---|
| 3513 | if (rx_debug_trace)
|
---|
| 3514 | fprintf (stderr, "+++ %d returns %d (to %d)+++\n",
|
---|
| 3515 | search_state.line_no,
|
---|
| 3516 | search_state.test_ret,
|
---|
| 3517 | bf->stk_search_state.line_no);
|
---|
| 3518 | #endif
|
---|
| 3519 |
|
---|
| 3520 | while (search_state.counter_stack
|
---|
| 3521 | && (!bf->counter_stack_sp
|
---|
| 3522 | || (bf->counter_stack_sp
|
---|
| 3523 | != search_state.counter_stack->sp)))
|
---|
| 3524 | {
|
---|
| 3525 | POP(search_state.counter_stack,
|
---|
| 3526 | sizeof (struct rx_counter_frame));
|
---|
| 3527 | }
|
---|
| 3528 |
|
---|
| 3529 | if (search_state.test_ret == rx_test_error)
|
---|
| 3530 | {
|
---|
| 3531 | POP (search_state.backtrack_stack,
|
---|
| 3532 | search_state.backtrack_frame_bytes);
|
---|
| 3533 | goto test_do_return;
|
---|
| 3534 | }
|
---|
| 3535 |
|
---|
| 3536 | /* If a non-longest match was found and that is good
|
---|
| 3537 | * enough, return immediately.
|
---|
| 3538 | */
|
---|
| 3539 | if ( (search_state.test_ret == rx_test_found_first)
|
---|
| 3540 | && search_state.first_found)
|
---|
| 3541 | {
|
---|
| 3542 | rx_unlock_superstate (&rxb->rx, bf->stk_super);
|
---|
| 3543 | POP (search_state.backtrack_stack,
|
---|
| 3544 | search_state.backtrack_frame_bytes);
|
---|
| 3545 | goto test_do_return;
|
---|
| 3546 | }
|
---|
| 3547 |
|
---|
| 3548 | search_state.test_ret = bf->stk_test_ret;
|
---|
| 3549 | search_state.last_l = bf->stk_last_l;
|
---|
| 3550 | search_state.last_r = bf->stk_last_r;
|
---|
| 3551 | bf->df = bf->df->next_same_super_edge[0];
|
---|
| 3552 | search_state.super = bf->stk_super;
|
---|
| 3553 | search_state.c = bf->stk_c;
|
---|
| 3554 | #ifdef RX_DEBUG_0
|
---|
| 3555 | search_state.line_no = bf->stk_search_state.line_no;
|
---|
| 3556 | #endif
|
---|
| 3557 |
|
---|
| 3558 | if (rxb->match_regs_on_stack)
|
---|
| 3559 | {
|
---|
| 3560 | int x;
|
---|
| 3561 | regoff_t * stk =
|
---|
| 3562 | (regoff_t *)((char *)bf + sizeof (*bf));
|
---|
| 3563 | for (x = 0; x <= search_state.last_l; ++x)
|
---|
| 3564 | search_state.lparen[x] = stk[x];
|
---|
| 3565 | stk += x;
|
---|
| 3566 | for (x = 0; x <= search_state.last_r; ++x)
|
---|
| 3567 | search_state.rparen[x] = stk[x];
|
---|
| 3568 | }
|
---|
| 3569 |
|
---|
| 3570 | {
|
---|
| 3571 | int x;
|
---|
| 3572 | try_burst_2:
|
---|
| 3573 | x = get_burst (&bf->stk_test_pos, app_closure, stop);
|
---|
| 3574 | switch (x)
|
---|
| 3575 | {
|
---|
| 3576 | case rx_get_burst_continuation:
|
---|
| 3577 | search_state.saved_bf = bf;
|
---|
| 3578 | test_pc = rx_test_backtrack_return;
|
---|
| 3579 | goto test_return_continuation;
|
---|
| 3580 | resume_continuation_3:
|
---|
| 3581 | bf = search_state.saved_bf;
|
---|
| 3582 | goto try_burst_2;
|
---|
| 3583 | case rx_get_burst_no_more:
|
---|
| 3584 | /* Since we've been here before, it is some kind of
|
---|
| 3585 | * error that we can't return.
|
---|
| 3586 | */
|
---|
| 3587 | case rx_get_burst_error:
|
---|
| 3588 | search_state.test_ret = rx_test_internal_error;
|
---|
| 3589 | goto test_do_return;
|
---|
| 3590 | case rx_get_burst_ok:
|
---|
| 3591 | break;
|
---|
| 3592 | }
|
---|
| 3593 | }
|
---|
| 3594 | search_state.test_pos = bf->stk_test_pos;
|
---|
| 3595 | goto while_non_det_options;
|
---|
| 3596 | }
|
---|
| 3597 |
|
---|
| 3598 |
|
---|
| 3599 | case rx_cache_miss:
|
---|
| 3600 | /* Because the superstate NFA is lazily constructed,
|
---|
| 3601 | * and in fact may erode from underneath us, we sometimes
|
---|
| 3602 | * have to construct the next instruction from the hard way.
|
---|
| 3603 | * This invokes one step in the lazy-conversion.
|
---|
| 3604 | */
|
---|
| 3605 | search_state.ifr = rx_handle_cache_miss (&rxb->rx,
|
---|
| 3606 | search_state.super,
|
---|
| 3607 | search_state.c,
|
---|
| 3608 | search_state.ifr->data_2);
|
---|
| 3609 | if (!search_state.ifr)
|
---|
| 3610 | {
|
---|
| 3611 | search_state.test_ret = rx_test_internal_error;
|
---|
| 3612 | goto test_do_return;
|
---|
| 3613 | }
|
---|
| 3614 | goto restart;
|
---|
| 3615 |
|
---|
| 3616 | case rx_backtrack:
|
---|
| 3617 | /* RX_BACKTRACK means that we've reached the empty
|
---|
| 3618 | * superstate, indicating that match can't succeed
|
---|
| 3619 | * from this point.
|
---|
| 3620 | */
|
---|
| 3621 | goto test_do_return;
|
---|
| 3622 |
|
---|
| 3623 | case rx_next_char:
|
---|
| 3624 | case rx_error_inx:
|
---|
| 3625 | case rx_num_instructions:
|
---|
| 3626 | search_state.ret_val = 0;
|
---|
| 3627 | goto test_do_return;
|
---|
| 3628 | }
|
---|
| 3629 | goto pseudo_while_1;
|
---|
| 3630 | }
|
---|
| 3631 |
|
---|
| 3632 | /* Healthy exits from the test-match loop do a
|
---|
| 3633 | * `goto return_from_test_match' On the other hand,
|
---|
| 3634 | * we might end up here.
|
---|
| 3635 | */
|
---|
| 3636 | error_in_testing_match:
|
---|
| 3637 | test_state = rx_test_error;
|
---|
| 3638 | goto test_returns_to_search;
|
---|
| 3639 |
|
---|
| 3640 | /***** fastmap/search loop body
|
---|
| 3641 | * considering the results testing for a match
|
---|
| 3642 | */
|
---|
| 3643 |
|
---|
| 3644 | return_from_test_match:
|
---|
| 3645 |
|
---|
| 3646 | if (search_state.best_last_l >= 0)
|
---|
| 3647 | {
|
---|
| 3648 | if (regs && (regs->start != search_state.best_lparen))
|
---|
| 3649 | {
|
---|
| 3650 | bcopy (search_state.best_lparen, regs->start,
|
---|
| 3651 | regs->num_regs * sizeof (int));
|
---|
| 3652 | bcopy (search_state.best_rparen, regs->end,
|
---|
| 3653 | regs->num_regs * sizeof (int));
|
---|
| 3654 | }
|
---|
| 3655 | if (regs && !rxb->no_sub)
|
---|
| 3656 | {
|
---|
| 3657 | int q;
|
---|
| 3658 | int bound = (regs->num_regs > search_state.num_regs
|
---|
| 3659 | ? regs->num_regs
|
---|
| 3660 | : search_state.num_regs);
|
---|
| 3661 | regoff_t * s = regs->start;
|
---|
| 3662 | regoff_t * e = regs->end;
|
---|
| 3663 | for (q = search_state.best_last_l + 1; q < bound; ++q)
|
---|
| 3664 | s[q] = e[q] = -1;
|
---|
| 3665 | }
|
---|
| 3666 | search_state.ret_val = search_state.best_lparen[0];
|
---|
| 3667 | test_state = rx_test_ok;
|
---|
| 3668 | goto test_returns_to_search;
|
---|
| 3669 | }
|
---|
| 3670 | else
|
---|
| 3671 | {
|
---|
| 3672 | test_state = rx_test_fail;
|
---|
| 3673 | goto test_returns_to_search;
|
---|
| 3674 | }
|
---|
| 3675 |
|
---|
| 3676 | test_return_continuation:
|
---|
| 3677 | search_state.test_match_resume_pt = test_pc;
|
---|
| 3678 | test_state = rx_test_continuation;
|
---|
| 3679 | goto test_returns_to_search;
|
---|
| 3680 | }
|
---|
| 3681 | }
|
---|
| 3682 |
|
---|
| 3683 |
|
---|
| 3684 |
|
---|
| 3685 | #endif /* RX_WANT_RX_DEFS */
|
---|
| 3686 |
|
---|
| 3687 |
|
---|
| 3688 |
|
---|
| 3689 | #else /* RX_WANT_SE_DEFS */
|
---|
| 3690 | /* Integers are used to represent side effects.
|
---|
| 3691 | *
|
---|
| 3692 | * Simple side effects are given negative integer names by these enums.
|
---|
| 3693 | *
|
---|
| 3694 | * Non-negative names are reserved for complex effects.
|
---|
| 3695 | *
|
---|
| 3696 | * Complex effects are those that take arguments. For example,
|
---|
| 3697 | * a register assignment associated with a group is complex because
|
---|
| 3698 | * it requires an argument to tell which group is being matched.
|
---|
| 3699 | *
|
---|
| 3700 | * The integer name of a complex effect is an index into rxb->se_params.
|
---|
| 3701 | */
|
---|
| 3702 |
|
---|
| 3703 | RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */
|
---|
| 3704 |
|
---|
| 3705 | RX_DEF_SE(0, re_se_pushback, = re_se_try - 1)
|
---|
| 3706 | RX_DEF_SE(0, re_se_push0, = re_se_pushback -1)
|
---|
| 3707 | RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1)
|
---|
| 3708 | RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1)
|
---|
| 3709 | RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1)
|
---|
| 3710 |
|
---|
| 3711 | RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */
|
---|
| 3712 | RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */
|
---|
| 3713 | RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */
|
---|
| 3714 |
|
---|
| 3715 | RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */
|
---|
| 3716 | RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */
|
---|
| 3717 |
|
---|
| 3718 | RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1)
|
---|
| 3719 | RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1)
|
---|
| 3720 | RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1)
|
---|
| 3721 |
|
---|
| 3722 | RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1)
|
---|
| 3723 | RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1)
|
---|
| 3724 |
|
---|
| 3725 | /* This fails except at the end of a line.
|
---|
| 3726 | * It deserves to go here since it is typicly one of the last steps
|
---|
| 3727 | * in a match.
|
---|
| 3728 | */
|
---|
| 3729 | RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1)
|
---|
| 3730 |
|
---|
| 3731 | /* Simple effects: */
|
---|
| 3732 | RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1)
|
---|
| 3733 |
|
---|
| 3734 | /* Complex effects. These are used in the 'se' field of
|
---|
| 3735 | * a struct re_se_params. Indexes into the se array
|
---|
| 3736 | * are stored as instructions on nfa edges.
|
---|
| 3737 | */
|
---|
| 3738 | RX_DEF_CPLX_SE(1, re_se_win, = 0)
|
---|
| 3739 | RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1)
|
---|
| 3740 | RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1)
|
---|
| 3741 | RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1)
|
---|
| 3742 | RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1)
|
---|
| 3743 | RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1)
|
---|
| 3744 | RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1)
|
---|
| 3745 |
|
---|
| 3746 | #endif
|
---|
| 3747 |
|
---|
| 3748 | #endif
|
---|