source: gsdl/trunk/trunk/mg/lib/regex.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 171.6 KB
Line 
1/* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22/* AIX requires this to be the first thing in the file. */
23#if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25#endif
26
27#define _GNU_SOURCE
28
29#ifdef HAVE_CONFIG_H
30# ifdef __WIN32__ /* [RPAP - Feb 97: WIN32 Port] */
31# include <win32cfg.h>
32# else
33# include <sysfuncs.h>
34# endif
35#endif
36
37/* We need this for `regex.h', and perhaps for the Emacs include files. */
38#include <sys/types.h>
39
40/* This is for other GNU distributions with internationalized messages. */
41#if HAVE_LIBINTL_H || defined (_LIBC)
42# include <libintl.h>
43#else
44# define gettext(msgid) (msgid)
45#endif
46
47/* The `emacs' switch turns on certain matching commands
48 that make sense only in Emacs. */
49#ifdef emacs
50
51#include "lisp.h"
52#include "buffer.h"
53#include "syntax.h"
54
55#else /* not emacs */
56
57/* If we are not linking with Emacs proper,
58 we can't use the relocating allocator
59 even if sysfuncs.h says that we can. */
60#undef REL_ALLOC
61
62#if defined (STDC_HEADERS) || defined (_LIBC)
63#include <stdlib.h>
64#else
65char *malloc ();
66char *realloc ();
67#endif
68
69/* We used to test for `BSTRING' here, but only GCC and Emacs define
70 `BSTRING', as far as I know, and neither of them use this code. */
71#ifndef INHIBIT_STRING_HEADER
72#if HAVE_STRING_H || STDC_HEADERS || defined (_LIBC)
73#include <string.h>
74#ifndef bcmp
75#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
76#endif
77#ifndef bcopy
78#define bcopy(s, d, n) memcpy ((d), (s), (n))
79#endif
80#ifndef bzero
81#define bzero(s, n) memset ((s), 0, (n))
82#endif
83#else
84#include <strings.h>
85#endif
86#endif
87
88/* Define the syntax stuff for \<, \>, etc. */
89
90/* This must be nonzero for the wordchar and notwordchar pattern
91 commands in re_match_2. */
92#ifndef Sword
93#define Sword 1
94#endif
95
96#ifdef SWITCH_ENUM_BUG
97#define SWITCH_ENUM_CAST(x) ((int)(x))
98#else
99#define SWITCH_ENUM_CAST(x) (x)
100#endif
101
102#ifdef SYNTAX_TABLE
103
104extern char *re_syntax_table;
105
106#else /* not SYNTAX_TABLE */
107
108/* How many characters in the character set. */
109#define CHAR_SET_SIZE 256
110
111static char re_syntax_table[CHAR_SET_SIZE];
112
113static void
114init_syntax_once ()
115{
116 register int c;
117 static int done = 0;
118
119 if (done)
120 return;
121
122 bzero (re_syntax_table, sizeof re_syntax_table);
123
124 for (c = 'a'; c <= 'z'; c++)
125 re_syntax_table[c] = Sword;
126
127 for (c = 'A'; c <= 'Z'; c++)
128 re_syntax_table[c] = Sword;
129
130 for (c = '0'; c <= '9'; c++)
131 re_syntax_table[c] = Sword;
132
133 re_syntax_table['_'] = Sword;
134
135 done = 1;
136}
137
138#endif /* not SYNTAX_TABLE */
139
140#define SYNTAX(c) re_syntax_table[c]
141
142#endif /* not emacs */
143
144
145/* Get the interface, including the syntax bits. */
146#include "regex.h"
147
148/* isalpha etc. are used for the character classes. */
149#include <ctype.h>
150
151/* Jim Meyering writes:
152
153 "... Some ctype macros are valid only for character codes that
154 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
155 using /bin/cc or gcc but without giving an ansi option). So, all
156 ctype uses should be through macros like ISPRINT... If
157 STDC_HEADERS is defined, then autoconf has verified that the ctype
158 macros don't need to be guarded with references to isascii. ...
159 Defining isascii to 1 should let any compiler worth its salt
160 eliminate the && through constant folding." */
161
162#ifndef ISASCII
163# if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
164# define ISASCII(c) 1
165# else
166# define ISASCII(c) isascii(c)
167# endif
168#endif
169
170#ifdef isblank
171#define ISBLANK(c) (ISASCII (c) && isblank (c))
172#else
173#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
174#endif
175#ifdef isgraph
176#define ISGRAPH(c) (ISASCII (c) && isgraph (c))
177#else
178#define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
179#endif
180
181#define ISPRINT(c) (ISASCII (c) && isprint (c))
182#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
183#define ISALNUM(c) (ISASCII (c) && isalnum (c))
184#define ISALPHA(c) (ISASCII (c) && isalpha (c))
185#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
186#define ISLOWER(c) (ISASCII (c) && islower (c))
187#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
188#define ISSPACE(c) (ISASCII (c) && isspace (c))
189#define ISUPPER(c) (ISASCII (c) && isupper (c))
190#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
191
192#ifndef NULL
193#define NULL 0
194#endif
195
196/* We remove any previous definition of `SIGN_EXTEND_CHAR',
197 since ours (we hope) works properly with all combinations of
198 machines, compilers, `char' and `unsigned char' argument types.
199 (Per Bothner suggested the basic approach.) */
200#undef SIGN_EXTEND_CHAR
201#if __STDC__
202#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
203#else /* not __STDC__ */
204/* As in Harbison and Steele. */
205#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
206#endif
207
208
209/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
210 use `alloca' instead of `malloc'. This is because using malloc in
211 re_search* or re_match* could cause memory leaks when C-g is used in
212 Emacs; also, malloc is slower and causes storage fragmentation. On
213 the other hand, malloc is more portable, and easier to debug.
214
215 Because we sometimes use alloca, some routines have to be macros,
216 not functions -- `alloca'-allocated space disappears at the end of the
217 function it is called in. */
218
219#ifdef REGEX_MALLOC
220
221#define REGEX_ALLOCATE malloc
222#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
223#define REGEX_FREE free
224
225#else /* not REGEX_MALLOC */
226
227/* Emacs already defines alloca, sometimes. */
228#ifndef alloca
229
230/* Make alloca work the best possible way. */
231#ifdef __GNUC__
232#define alloca __builtin_alloca
233#else /* not __GNUC__ */
234#if HAVE_ALLOCA_H
235#include <alloca.h>
236#else /* not __GNUC__ or HAVE_ALLOCA_H */
237#ifndef _AIX /* Already did AIX, up at the top. */
238char *alloca ();
239#endif /* not _AIX */
240#endif /* not HAVE_ALLOCA_H */
241#endif /* not __GNUC__ */
242
243#endif /* not alloca */
244
245#define REGEX_ALLOCATE alloca
246
247/* Assumes a `char *destination' variable. */
248#define REGEX_REALLOCATE(source, osize, nsize) \
249 (destination = (char *) alloca (nsize), \
250 bcopy (source, destination, osize), \
251 destination)
252
253/* No need to do anything to free, after alloca. */
254#define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
255
256#endif /* not REGEX_MALLOC */
257
258/* Define how to allocate the failure stack. */
259
260#ifdef REL_ALLOC
261#define REGEX_ALLOCATE_STACK(size) \
262 r_alloc (&failure_stack_ptr, (size))
263#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
264 r_re_alloc (&failure_stack_ptr, (nsize))
265#define REGEX_FREE_STACK(ptr) \
266 r_alloc_free (&failure_stack_ptr)
267
268#else /* not REL_ALLOC */
269
270#ifdef REGEX_MALLOC
271
272#define REGEX_ALLOCATE_STACK malloc
273#define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
274#define REGEX_FREE_STACK free
275
276#else /* not REGEX_MALLOC */
277
278#define REGEX_ALLOCATE_STACK alloca
279
280#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
281 REGEX_REALLOCATE (source, osize, nsize)
282/* No need to explicitly free anything. */
283#define REGEX_FREE_STACK(arg)
284
285#endif /* not REGEX_MALLOC */
286#endif /* not REL_ALLOC */
287
288
289/* True if `size1' is non-NULL and PTR is pointing anywhere inside
290 `string1' or just past its end. This works if PTR is NULL, which is
291 a good thing. */
292#define FIRST_STRING_P(ptr) \
293 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
294
295/* (Re)Allocate N items of type T using malloc, or fail. */
296#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
297#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
298#define RETALLOC_IF(addr, n, t) \
299 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
300#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
301
302#define BYTEWIDTH 8 /* In bits. */
303
304#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
305
306#undef MAX
307#undef MIN
308#define MAX(a, b) ((a) > (b) ? (a) : (b))
309#define MIN(a, b) ((a) < (b) ? (a) : (b))
310
311typedef char boolean;
312#define false 0
313#define true 1
314
315static int re_match_2_internal ();
316
317
318/* These are the command codes that appear in compiled regular
319 expressions. Some opcodes are followed by argument bytes. A
320 command code can specify any interpretation whatsoever for its
321 arguments. Zero bytes may appear in the compiled regular expression. */
322
323typedef enum
324{
325 no_op = 0,
326
327 /* Succeed right away--no more backtracking. */
328 succeed,
329
330 /* Followed by one byte giving n, then by n literal bytes. */
331 exactn,
332
333 /* Matches any (more or less) character. */
334 anychar,
335
336 /* Matches any one char belonging to specified set. First
337 following byte is number of bitmap bytes. Then come bytes
338 for a bitmap saying which chars are in. Bits in each byte
339 are ordered low-bit-first. A character is in the set if its
340 bit is 1. A character too large to have a bit in the map is
341 automatically not in the set. */
342 charset,
343
344 /* Same parameters as charset, but match any character that is
345 not one of those specified. */
346 charset_not,
347
348 /* Start remembering the text that is matched, for storing in a
349 register. Followed by one byte with the register number, in
350 the range 0 to one less than the pattern buffer's re_nsub
351 field. Then followed by one byte with the number of groups
352 inner to this one. (This last has to be part of the
353 start_memory only because we need it in the on_failure_jump
354 of re_match_2.) */
355 start_memory,
356
357 /* Stop remembering the text that is matched and store it in a
358 memory register. Followed by one byte with the register
359 number, in the range 0 to one less than `re_nsub' in the
360 pattern buffer, and one byte with the number of inner groups,
361 just like `start_memory'. (We need the number of inner
362 groups here because we don't have any easy way of finding the
363 corresponding start_memory when we're at a stop_memory.) */
364 stop_memory,
365
366 /* Match a duplicate of something remembered. Followed by one
367 byte containing the register number. */
368 duplicate,
369
370 /* Fail unless at beginning of line. */
371 begline,
372
373 /* Fail unless at end of line. */
374 endline,
375
376 /* Succeeds if at beginning of buffer (if emacs) or at beginning
377 of string to be matched (if not). */
378 begbuf,
379
380 /* Analogously, for end of buffer/string. */
381 endbuf,
382
383 /* Followed by two byte relative address to which to jump. */
384 jump,
385
386 /* Same as jump, but marks the end of an alternative. */
387 jump_past_alt,
388
389 /* Followed by two-byte relative address of place to resume at
390 in case of failure. */
391 on_failure_jump,
392
393 /* Like on_failure_jump, but pushes a placeholder instead of the
394 current string position when executed. */
395 on_failure_keep_string_jump,
396
397 /* Throw away latest failure point and then jump to following
398 two-byte relative address. */
399 pop_failure_jump,
400
401 /* Change to pop_failure_jump if know won't have to backtrack to
402 match; otherwise change to jump. This is used to jump
403 back to the beginning of a repeat. If what follows this jump
404 clearly won't match what the repeat does, such that we can be
405 sure that there is no use backtracking out of repetitions
406 already matched, then we change it to a pop_failure_jump.
407 Followed by two-byte address. */
408 maybe_pop_jump,
409
410 /* Jump to following two-byte address, and push a dummy failure
411 point. This failure point will be thrown away if an attempt
412 is made to use it for a failure. A `+' construct makes this
413 before the first repeat. Also used as an intermediary kind
414 of jump when compiling an alternative. */
415 dummy_failure_jump,
416
417 /* Push a dummy failure point and continue. Used at the end of
418 alternatives. */
419 push_dummy_failure,
420
421 /* Followed by two-byte relative address and two-byte number n.
422 After matching N times, jump to the address upon failure. */
423 succeed_n,
424
425 /* Followed by two-byte relative address, and two-byte number n.
426 Jump to the address N times, then fail. */
427 jump_n,
428
429 /* Set the following two-byte relative address to the
430 subsequent two-byte number. The address *includes* the two
431 bytes of number. */
432 set_number_at,
433
434 wordchar, /* Matches any word-constituent character. */
435 notwordchar, /* Matches any char that is not a word-constituent. */
436
437 wordbeg, /* Succeeds if at word beginning. */
438 wordend, /* Succeeds if at word end. */
439
440 wordbound, /* Succeeds if at a word boundary. */
441 notwordbound /* Succeeds if not at a word boundary. */
442
443#ifdef emacs
444 ,before_dot, /* Succeeds if before point. */
445 at_dot, /* Succeeds if at point. */
446 after_dot, /* Succeeds if after point. */
447
448 /* Matches any character whose syntax is specified. Followed by
449 a byte which contains a syntax code, e.g., Sword. */
450 syntaxspec,
451
452 /* Matches any character whose syntax is not that specified. */
453 notsyntaxspec
454#endif /* emacs */
455} re_opcode_t;
456
457
458/* Common operations on the compiled pattern. */
459
460/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
461
462#define STORE_NUMBER(destination, number) \
463 do { \
464 (destination)[0] = (number) & 0377; \
465 (destination)[1] = (number) >> 8; \
466 } while (0)
467
468/* Same as STORE_NUMBER, except increment DESTINATION to
469 the byte after where the number is stored. Therefore, DESTINATION
470 must be an lvalue. */
471
472#define STORE_NUMBER_AND_INCR(destination, number) \
473 do { \
474 STORE_NUMBER (destination, number); \
475 (destination) += 2; \
476 } while (0)
477
478/* Put into DESTINATION a number stored in two contiguous bytes starting
479 at SOURCE. */
480
481#define EXTRACT_NUMBER(destination, source) \
482 do { \
483 (destination) = *(source) & 0377; \
484 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
485 } while (0)
486
487#ifdef DEBUG
488static void
489extract_number (dest, source)
490 int *dest;
491 unsigned char *source;
492{
493 int temp = SIGN_EXTEND_CHAR (*(source + 1));
494 *dest = *source & 0377;
495 *dest += temp << 8;
496}
497
498#ifndef EXTRACT_MACROS /* To debug the macros. */
499#undef EXTRACT_NUMBER
500#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
501#endif /* not EXTRACT_MACROS */
502
503#endif /* DEBUG */
504
505/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
506 SOURCE must be an lvalue. */
507
508#define EXTRACT_NUMBER_AND_INCR(destination, source) \
509 do { \
510 EXTRACT_NUMBER (destination, source); \
511 (source) += 2; \
512 } while (0)
513
514#ifdef DEBUG
515static void
516extract_number_and_incr (destination, source)
517 int *destination;
518 unsigned char **source;
519{
520 extract_number (destination, *source);
521 *source += 2;
522}
523
524#ifndef EXTRACT_MACROS
525#undef EXTRACT_NUMBER_AND_INCR
526#define EXTRACT_NUMBER_AND_INCR(dest, src) \
527 extract_number_and_incr (&dest, &src)
528#endif /* not EXTRACT_MACROS */
529
530#endif /* DEBUG */
531
532
533/* If DEBUG is defined, Regex prints many voluminous messages about what
534 it is doing (if the variable `debug' is nonzero). If linked with the
535 main program in `iregex.c', you can enter patterns and strings
536 interactively. And if linked with the main program in `main.c' and
537 the other test files, you can run the already-written tests. */
538
539#ifdef DEBUG
540
541/* We use standard I/O for debugging. */
542#include <stdio.h>
543
544/* It is useful to test things that ``must'' be true when debugging. */
545#include <assert.h>
546
547static int debug = 0;
548
549#define DEBUG_STATEMENT(e) e
550#define DEBUG_PRINT1(x) if (debug) printf (x)
551#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
552#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
553#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
554#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
555 if (debug) print_partial_compiled_pattern (s, e)
556#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
557 if (debug) print_double_string (w, s1, sz1, s2, sz2)
558
559
560/* Print the fastmap in human-readable form. */
561
562void
563print_fastmap (fastmap)
564 char *fastmap;
565{
566 unsigned was_a_range = 0;
567 unsigned i = 0;
568
569 while (i < (1 << BYTEWIDTH))
570 {
571 if (fastmap[i++])
572 {
573 was_a_range = 0;
574 putchar (i - 1);
575 while (i < (1 << BYTEWIDTH) && fastmap[i])
576 {
577 was_a_range = 1;
578 i++;
579 }
580 if (was_a_range)
581 {
582 printf ("-");
583 putchar (i - 1);
584 }
585 }
586 }
587 putchar ('\n');
588}
589
590
591/* Print a compiled pattern string in human-readable form, starting at
592 the START pointer into it and ending just before the pointer END. */
593
594void
595print_partial_compiled_pattern (start, end)
596 unsigned char *start;
597 unsigned char *end;
598{
599 int mcnt, mcnt2;
600 unsigned char *p = start;
601 unsigned char *pend = end;
602
603 if (start == NULL)
604 {
605 printf ("(null)\n");
606 return;
607 }
608
609 /* Loop over pattern commands. */
610 while (p < pend)
611 {
612 printf ("%d:\t", p - start);
613
614 switch ((re_opcode_t) *p++)
615 {
616 case no_op:
617 printf ("/no_op");
618 break;
619
620 case exactn:
621 mcnt = *p++;
622 printf ("/exactn/%d", mcnt);
623 do
624 {
625 putchar ('/');
626 putchar (*p++);
627 }
628 while (--mcnt);
629 break;
630
631 case start_memory:
632 mcnt = *p++;
633 printf ("/start_memory/%d/%d", mcnt, *p++);
634 break;
635
636 case stop_memory:
637 mcnt = *p++;
638 printf ("/stop_memory/%d/%d", mcnt, *p++);
639 break;
640
641 case duplicate:
642 printf ("/duplicate/%d", *p++);
643 break;
644
645 case anychar:
646 printf ("/anychar");
647 break;
648
649 case charset:
650 case charset_not:
651 {
652 register int c, last = -100;
653 register int in_range = 0;
654
655 printf ("/charset [%s",
656 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
657
658 assert (p + *p < pend);
659
660 for (c = 0; c < 256; c++)
661 if (c / 8 < *p
662 && (p[1 + (c/8)] & (1 << (c % 8))))
663 {
664 /* Are we starting a range? */
665 if (last + 1 == c && ! in_range)
666 {
667 putchar ('-');
668 in_range = 1;
669 }
670 /* Have we broken a range? */
671 else if (last + 1 != c && in_range)
672 {
673 putchar (last);
674 in_range = 0;
675 }
676
677 if (! in_range)
678 putchar (c);
679
680 last = c;
681 }
682
683 if (in_range)
684 putchar (last);
685
686 putchar (']');
687
688 p += 1 + *p;
689 }
690 break;
691
692 case begline:
693 printf ("/begline");
694 break;
695
696 case endline:
697 printf ("/endline");
698 break;
699
700 case on_failure_jump:
701 extract_number_and_incr (&mcnt, &p);
702 printf ("/on_failure_jump to %d", p + mcnt - start);
703 break;
704
705 case on_failure_keep_string_jump:
706 extract_number_and_incr (&mcnt, &p);
707 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
708 break;
709
710 case dummy_failure_jump:
711 extract_number_and_incr (&mcnt, &p);
712 printf ("/dummy_failure_jump to %d", p + mcnt - start);
713 break;
714
715 case push_dummy_failure:
716 printf ("/push_dummy_failure");
717 break;
718
719 case maybe_pop_jump:
720 extract_number_and_incr (&mcnt, &p);
721 printf ("/maybe_pop_jump to %d", p + mcnt - start);
722 break;
723
724 case pop_failure_jump:
725 extract_number_and_incr (&mcnt, &p);
726 printf ("/pop_failure_jump to %d", p + mcnt - start);
727 break;
728
729 case jump_past_alt:
730 extract_number_and_incr (&mcnt, &p);
731 printf ("/jump_past_alt to %d", p + mcnt - start);
732 break;
733
734 case jump:
735 extract_number_and_incr (&mcnt, &p);
736 printf ("/jump to %d", p + mcnt - start);
737 break;
738
739 case succeed_n:
740 extract_number_and_incr (&mcnt, &p);
741 extract_number_and_incr (&mcnt2, &p);
742 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2);
743 break;
744
745 case jump_n:
746 extract_number_and_incr (&mcnt, &p);
747 extract_number_and_incr (&mcnt2, &p);
748 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2);
749 break;
750
751 case set_number_at:
752 extract_number_and_incr (&mcnt, &p);
753 extract_number_and_incr (&mcnt2, &p);
754 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2);
755 break;
756
757 case wordbound:
758 printf ("/wordbound");
759 break;
760
761 case notwordbound:
762 printf ("/notwordbound");
763 break;
764
765 case wordbeg:
766 printf ("/wordbeg");
767 break;
768
769 case wordend:
770 printf ("/wordend");
771
772#ifdef emacs
773 case before_dot:
774 printf ("/before_dot");
775 break;
776
777 case at_dot:
778 printf ("/at_dot");
779 break;
780
781 case after_dot:
782 printf ("/after_dot");
783 break;
784
785 case syntaxspec:
786 printf ("/syntaxspec");
787 mcnt = *p++;
788 printf ("/%d", mcnt);
789 break;
790
791 case notsyntaxspec:
792 printf ("/notsyntaxspec");
793 mcnt = *p++;
794 printf ("/%d", mcnt);
795 break;
796#endif /* emacs */
797
798 case wordchar:
799 printf ("/wordchar");
800 break;
801
802 case notwordchar:
803 printf ("/notwordchar");
804 break;
805
806 case begbuf:
807 printf ("/begbuf");
808 break;
809
810 case endbuf:
811 printf ("/endbuf");
812 break;
813
814 default:
815 printf ("?%d", *(p-1));
816 }
817
818 putchar ('\n');
819 }
820
821 printf ("%d:\tend of pattern.\n", p - start);
822}
823
824
825void
826print_compiled_pattern (bufp)
827 struct re_pattern_buffer *bufp;
828{
829 unsigned char *buffer = bufp->buffer;
830
831 print_partial_compiled_pattern (buffer, buffer + bufp->used);
832 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
833
834 if (bufp->fastmap_accurate && bufp->fastmap)
835 {
836 printf ("fastmap: ");
837 print_fastmap (bufp->fastmap);
838 }
839
840 printf ("re_nsub: %d\t", bufp->re_nsub);
841 printf ("regs_alloc: %d\t", bufp->regs_allocated);
842 printf ("can_be_null: %d\t", bufp->can_be_null);
843 printf ("newline_anchor: %d\n", bufp->newline_anchor);
844 printf ("no_sub: %d\t", bufp->no_sub);
845 printf ("not_bol: %d\t", bufp->not_bol);
846 printf ("not_eol: %d\t", bufp->not_eol);
847 printf ("syntax: %d\n", bufp->syntax);
848 /* Perhaps we should print the translate table? */
849}
850
851
852void
853print_double_string (where, string1, size1, string2, size2)
854 const char *where;
855 const char *string1;
856 const char *string2;
857 int size1;
858 int size2;
859{
860 unsigned this_char;
861
862 if (where == NULL)
863 printf ("(null)");
864 else
865 {
866 if (FIRST_STRING_P (where))
867 {
868 for (this_char = where - string1; this_char < size1; this_char++)
869 putchar (string1[this_char]);
870
871 where = string2;
872 }
873
874 for (this_char = where - string2; this_char < size2; this_char++)
875 putchar (string2[this_char]);
876 }
877}
878
879#else /* not DEBUG */
880
881#undef assert
882#define assert(e)
883
884#define DEBUG_STATEMENT(e)
885#define DEBUG_PRINT1(x)
886#define DEBUG_PRINT2(x1, x2)
887#define DEBUG_PRINT3(x1, x2, x3)
888#define DEBUG_PRINT4(x1, x2, x3, x4)
889#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
890#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
891
892#endif /* not DEBUG */
893
894
895/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
896 also be assigned to arbitrarily: each pattern buffer stores its own
897 syntax, so it can be changed between regex compilations. */
898/* This has no initializer because initialized variables in Emacs
899 become read-only after dumping. */
900reg_syntax_t re_syntax_options;
901
902
903/* Specify the precise syntax of regexps for compilation. This provides
904 for compatibility for various utilities which historically have
905 different, incompatible syntaxes.
906
907 The argument SYNTAX is a bit mask comprised of the various bits
908 defined in regex.h. We return the old syntax. */
909
910reg_syntax_t
911re_set_syntax (syntax)
912 reg_syntax_t syntax;
913{
914 reg_syntax_t ret = re_syntax_options;
915
916 re_syntax_options = syntax;
917 return ret;
918}
919
920
921/* This table gives an error message for each of the error codes listed
922 in regex.h. Obviously the order here has to be same as there.
923 POSIX doesn't require that we do anything for REG_NOERROR,
924 but why not be nice? */
925
926static const char *re_error_msgid[] =
927 { "Success", /* REG_NOERROR */
928 "No match", /* REG_NOMATCH */
929 "Invalid regular expression", /* REG_BADPAT */
930 "Invalid collation character", /* REG_ECOLLATE */
931 "Invalid character class name", /* REG_ECTYPE */
932 "Trailing backslash", /* REG_EESCAPE */
933 "Invalid back reference", /* REG_ESUBREG */
934 "Unmatched [ or [^", /* REG_EBRACK */
935 "Unmatched ( or \\(", /* REG_EPAREN */
936 "Unmatched \\{", /* REG_EBRACE */
937 "Invalid content of \\{\\}", /* REG_BADBR */
938 "Invalid range end", /* REG_ERANGE */
939 "Memory exhausted", /* REG_ESPACE */
940 "Invalid preceding regular expression", /* REG_BADRPT */
941 "Premature end of regular expression", /* REG_EEND */
942 "Regular expression too big", /* REG_ESIZE */
943 "Unmatched ) or \\)", /* REG_ERPAREN */
944 };
945
946
947/* Avoiding alloca during matching, to placate r_alloc. */
948
949/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
950 searching and matching functions should not call alloca. On some
951 systems, alloca is implemented in terms of malloc, and if we're
952 using the relocating allocator routines, then malloc could cause a
953 relocation, which might (if the strings being searched are in the
954 ralloc heap) shift the data out from underneath the regexp
955 routines.
956
957 Here's another reason to avoid allocation: Emacs
958 processes input from X in a signal handler; processing X input may
959 call malloc; if input arrives while a matching routine is calling
960 malloc, then we're scrod. But Emacs can't just block input while
961 calling matching routines; then we don't notice interrupts when
962 they come in. So, Emacs blocks input around all regexp calls
963 except the matching calls, which it leaves unprotected, in the
964 faith that they will not malloc. */
965
966/* Normally, this is fine. */
967#define MATCH_MAY_ALLOCATE
968
969/* When using GNU C, we are not REALLY using the C alloca, no matter
970 what sysfuncs.h may say. So don't take precautions for it. */
971#ifdef __GNUC__
972#undef C_ALLOCA
973#endif
974
975/* The match routines may not allocate if (1) they would do it with malloc
976 and (2) it's not safe for them to use malloc.
977 Note that if REL_ALLOC is defined, matching would not use malloc for the
978 failure stack, but we would still use it for the register vectors;
979 so REL_ALLOC should not affect this. */
980#if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
981#undef MATCH_MAY_ALLOCATE
982#endif
983
984
985
986/* Failure stack declarations and macros; both re_compile_fastmap and
987 re_match_2 use a failure stack. These have to be macros because of
988 REGEX_ALLOCATE_STACK. */
989
990
991/* Number of failure points for which to initially allocate space
992 when matching. If this number is exceeded, we allocate more
993 space, so it is not a hard limit. */
994#ifndef INIT_FAILURE_ALLOC
995#define INIT_FAILURE_ALLOC 5
996#endif
997
998/* Roughly the maximum number of failure points on the stack. Would be
999 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1000 This is a variable only so users of regex can assign to it; we never
1001 change it ourselves. */
1002#if defined (MATCH_MAY_ALLOCATE)
1003int re_max_failures = 200000;
1004#else
1005int re_max_failures = 2000;
1006#endif
1007
1008union fail_stack_elt
1009{
1010 unsigned char *pointer;
1011 int integer;
1012};
1013
1014typedef union fail_stack_elt fail_stack_elt_t;
1015
1016typedef struct
1017{
1018 fail_stack_elt_t *stack;
1019 unsigned size;
1020 unsigned avail; /* Offset of next open position. */
1021} fail_stack_type;
1022
1023#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1024#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1025#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1026
1027
1028/* Define macros to initialize and free the failure stack.
1029 Do `return -2' if the alloc fails. */
1030
1031#ifdef MATCH_MAY_ALLOCATE
1032#define INIT_FAIL_STACK() \
1033 do { \
1034 fail_stack.stack = (fail_stack_elt_t *) \
1035 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1036 \
1037 if (fail_stack.stack == NULL) \
1038 return -2; \
1039 \
1040 fail_stack.size = INIT_FAILURE_ALLOC; \
1041 fail_stack.avail = 0; \
1042 } while (0)
1043
1044#define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1045#else
1046#define INIT_FAIL_STACK() \
1047 do { \
1048 fail_stack.avail = 0; \
1049 } while (0)
1050
1051#define RESET_FAIL_STACK()
1052#endif
1053
1054
1055/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1056
1057 Return 1 if succeeds, and 0 if either ran out of memory
1058 allocating space for it or it was already too large.
1059
1060 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1061
1062#define DOUBLE_FAIL_STACK(fail_stack) \
1063 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1064 ? 0 \
1065 : ((fail_stack).stack = (fail_stack_elt_t *) \
1066 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1067 (fail_stack).size * sizeof (fail_stack_elt_t), \
1068 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1069 \
1070 (fail_stack).stack == NULL \
1071 ? 0 \
1072 : ((fail_stack).size <<= 1, \
1073 1)))
1074
1075
1076/* Push pointer POINTER on FAIL_STACK.
1077 Return 1 if was able to do so and 0 if ran out of memory allocating
1078 space to do so. */
1079#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1080 ((FAIL_STACK_FULL () \
1081 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1082 ? 0 \
1083 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1084 1))
1085
1086/* Push a pointer value onto the failure stack.
1087 Assumes the variable `fail_stack'. Probably should only
1088 be called from within `PUSH_FAILURE_POINT'. */
1089#define PUSH_FAILURE_POINTER(item) \
1090 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1091
1092/* This pushes an integer-valued item onto the failure stack.
1093 Assumes the variable `fail_stack'. Probably should only
1094 be called from within `PUSH_FAILURE_POINT'. */
1095#define PUSH_FAILURE_INT(item) \
1096 fail_stack.stack[fail_stack.avail++].integer = (item)
1097
1098/* Push a fail_stack_elt_t value onto the failure stack.
1099 Assumes the variable `fail_stack'. Probably should only
1100 be called from within `PUSH_FAILURE_POINT'. */
1101#define PUSH_FAILURE_ELT(item) \
1102 fail_stack.stack[fail_stack.avail++] = (item)
1103
1104/* These three POP... operations complement the three PUSH... operations.
1105 All assume that `fail_stack' is nonempty. */
1106#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1107#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1108#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1109
1110/* Used to omit pushing failure point id's when we're not debugging. */
1111#ifdef DEBUG
1112#define DEBUG_PUSH PUSH_FAILURE_INT
1113#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1114#else
1115#define DEBUG_PUSH(item)
1116#define DEBUG_POP(item_addr)
1117#endif
1118
1119
1120/* Push the information about the state we will need
1121 if we ever fail back to it.
1122
1123 Requires variables fail_stack, regstart, regend, reg_info, and
1124 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1125 declared.
1126
1127 Does `return FAILURE_CODE' if runs out of memory. */
1128
1129#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1130 do { \
1131 char *destination; \
1132 /* Must be int, so when we don't save any registers, the arithmetic \
1133 of 0 + -1 isn't done as unsigned. */ \
1134 int this_reg; \
1135 \
1136 DEBUG_STATEMENT (failure_id++); \
1137 DEBUG_STATEMENT (nfailure_points_pushed++); \
1138 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1139 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1140 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1141 \
1142 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1143 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1144 \
1145 /* Ensure we have enough space allocated for what we will push. */ \
1146 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1147 { \
1148 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1149 return failure_code; \
1150 \
1151 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1152 (fail_stack).size); \
1153 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1154 } \
1155 \
1156 /* Push the info, starting with the registers. */ \
1157 DEBUG_PRINT1 ("\n"); \
1158 \
1159 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1160 this_reg++) \
1161 { \
1162 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1163 DEBUG_STATEMENT (num_regs_pushed++); \
1164 \
1165 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1166 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1167 \
1168 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1169 PUSH_FAILURE_POINTER (regend[this_reg]); \
1170 \
1171 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
1172 DEBUG_PRINT2 (" match_null=%d", \
1173 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1174 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1175 DEBUG_PRINT2 (" matched_something=%d", \
1176 MATCHED_SOMETHING (reg_info[this_reg])); \
1177 DEBUG_PRINT2 (" ever_matched=%d", \
1178 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1179 DEBUG_PRINT1 ("\n"); \
1180 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1181 } \
1182 \
1183 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1184 PUSH_FAILURE_INT (lowest_active_reg); \
1185 \
1186 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1187 PUSH_FAILURE_INT (highest_active_reg); \
1188 \
1189 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
1190 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1191 PUSH_FAILURE_POINTER (pattern_place); \
1192 \
1193 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
1194 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1195 size2); \
1196 DEBUG_PRINT1 ("'\n"); \
1197 PUSH_FAILURE_POINTER (string_place); \
1198 \
1199 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1200 DEBUG_PUSH (failure_id); \
1201 } while (0)
1202
1203/* This is the number of items that are pushed and popped on the stack
1204 for each register. */
1205#define NUM_REG_ITEMS 3
1206
1207/* Individual items aside from the registers. */
1208#ifdef DEBUG
1209#define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1210#else
1211#define NUM_NONREG_ITEMS 4
1212#endif
1213
1214/* We push at most this many items on the stack. */
1215#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1216
1217/* We actually push this many items. */
1218#define NUM_FAILURE_ITEMS \
1219 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1220 + NUM_NONREG_ITEMS)
1221
1222/* How many items can still be added to the stack without overflowing it. */
1223#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1224
1225
1226/* Pops what PUSH_FAIL_STACK pushes.
1227
1228 We restore into the parameters, all of which should be lvalues:
1229 STR -- the saved data position.
1230 PAT -- the saved pattern position.
1231 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1232 REGSTART, REGEND -- arrays of string positions.
1233 REG_INFO -- array of information about each subexpression.
1234
1235 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1236 `pend', `string1', `size1', `string2', and `size2'. */
1237
1238#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1239{ \
1240 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
1241 int this_reg; \
1242 const unsigned char *string_temp; \
1243 \
1244 assert (!FAIL_STACK_EMPTY ()); \
1245 \
1246 /* Remove failure points and point to how many regs pushed. */ \
1247 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1248 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1249 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1250 \
1251 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1252 \
1253 DEBUG_POP (&failure_id); \
1254 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1255 \
1256 /* If the saved string location is NULL, it came from an \
1257 on_failure_keep_string_jump opcode, and we want to throw away the \
1258 saved NULL, thus retaining our current position in the string. */ \
1259 string_temp = POP_FAILURE_POINTER (); \
1260 if (string_temp != NULL) \
1261 str = (const char *) string_temp; \
1262 \
1263 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
1264 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1265 DEBUG_PRINT1 ("'\n"); \
1266 \
1267 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1268 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
1269 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1270 \
1271 /* Restore register info. */ \
1272 high_reg = (unsigned) POP_FAILURE_INT (); \
1273 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1274 \
1275 low_reg = (unsigned) POP_FAILURE_INT (); \
1276 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1277 \
1278 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1279 { \
1280 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1281 \
1282 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1283 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
1284 \
1285 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1286 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1287 \
1288 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1289 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1290 } \
1291 \
1292 set_regs_matched_done = 0; \
1293 DEBUG_STATEMENT (nfailure_points_popped++); \
1294} /* POP_FAILURE_POINT */
1295
1296
1297
1298
1299/* Structure for per-register (a.k.a. per-group) information.
1300 Other register information, such as the
1301 starting and ending positions (which are addresses), and the list of
1302 inner groups (which is a bits list) are maintained in separate
1303 variables.
1304
1305 We are making a (strictly speaking) nonportable assumption here: that
1306 the compiler will pack our bit fields into something that fits into
1307 the type of `word', i.e., is something that fits into one item on the
1308 failure stack. */
1309
1310typedef union
1311{
1312 fail_stack_elt_t word;
1313 struct
1314 {
1315 /* This field is one if this group can match the empty string,
1316 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1317#define MATCH_NULL_UNSET_VALUE 3
1318 unsigned match_null_string_p : 2;
1319 unsigned is_active : 1;
1320 unsigned matched_something : 1;
1321 unsigned ever_matched_something : 1;
1322 } bits;
1323} register_info_type;
1324
1325#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1326#define IS_ACTIVE(R) ((R).bits.is_active)
1327#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1328#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1329
1330
1331/* Call this when have matched a real character; it sets `matched' flags
1332 for the subexpressions which we are currently inside. Also records
1333 that those subexprs have matched. */
1334#define SET_REGS_MATCHED() \
1335 do \
1336 { \
1337 if (!set_regs_matched_done) \
1338 { \
1339 unsigned r; \
1340 set_regs_matched_done = 1; \
1341 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1342 { \
1343 MATCHED_SOMETHING (reg_info[r]) \
1344 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1345 = 1; \
1346 } \
1347 } \
1348 } \
1349 while (0)
1350
1351/* Registers are set to a sentinel when they haven't yet matched. */
1352static char reg_unset_dummy;
1353#define REG_UNSET_VALUE (&reg_unset_dummy)
1354#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1355
1356
1357/* Subroutine declarations and macros for regex_compile. */
1358
1359static void store_op1 (), store_op2 ();
1360static void insert_op1 (), insert_op2 ();
1361static boolean at_begline_loc_p (), at_endline_loc_p ();
1362static boolean group_in_compile_stack ();
1363static reg_errcode_t compile_range ();
1364
1365/* Fetch the next character in the uncompiled pattern---translating it
1366 if necessary. Also cast from a signed character in the constant
1367 string passed to us by the user to an unsigned char that we can use
1368 as an array index (in, e.g., `translate'). */
1369#define PATFETCH(c) \
1370 do {if (p == pend) return REG_EEND; \
1371 c = (unsigned char) *p++; \
1372 if (translate) c = translate[c]; \
1373 } while (0)
1374
1375/* Fetch the next character in the uncompiled pattern, with no
1376 translation. */
1377#define PATFETCH_RAW(c) \
1378 do {if (p == pend) return REG_EEND; \
1379 c = (unsigned char) *p++; \
1380 } while (0)
1381
1382/* Go backwards one character in the pattern. */
1383#define PATUNFETCH p--
1384
1385
1386/* If `translate' is non-null, return translate[D], else just D. We
1387 cast the subscript to translate because some data is declared as
1388 `char *', to avoid warnings when a string constant is passed. But
1389 when we use a character as a subscript we must make it unsigned. */
1390#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
1391
1392
1393/* Macros for outputting the compiled pattern into `buffer'. */
1394
1395/* If the buffer isn't allocated when it comes in, use this. */
1396#define INIT_BUF_SIZE 32
1397
1398/* Make sure we have at least N more bytes of space in buffer. */
1399#define GET_BUFFER_SPACE(n) \
1400 while (b - bufp->buffer + (n) > bufp->allocated) \
1401 EXTEND_BUFFER ()
1402
1403/* Make sure we have one more byte of buffer space and then add C to it. */
1404#define BUF_PUSH(c) \
1405 do { \
1406 GET_BUFFER_SPACE (1); \
1407 *b++ = (unsigned char) (c); \
1408 } while (0)
1409
1410
1411/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1412#define BUF_PUSH_2(c1, c2) \
1413 do { \
1414 GET_BUFFER_SPACE (2); \
1415 *b++ = (unsigned char) (c1); \
1416 *b++ = (unsigned char) (c2); \
1417 } while (0)
1418
1419
1420/* As with BUF_PUSH_2, except for three bytes. */
1421#define BUF_PUSH_3(c1, c2, c3) \
1422 do { \
1423 GET_BUFFER_SPACE (3); \
1424 *b++ = (unsigned char) (c1); \
1425 *b++ = (unsigned char) (c2); \
1426 *b++ = (unsigned char) (c3); \
1427 } while (0)
1428
1429
1430/* Store a jump with opcode OP at LOC to location TO. We store a
1431 relative address offset by the three bytes the jump itself occupies. */
1432#define STORE_JUMP(op, loc, to) \
1433 store_op1 (op, loc, (to) - (loc) - 3)
1434
1435/* Likewise, for a two-argument jump. */
1436#define STORE_JUMP2(op, loc, to, arg) \
1437 store_op2 (op, loc, (to) - (loc) - 3, arg)
1438
1439/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1440#define INSERT_JUMP(op, loc, to) \
1441 insert_op1 (op, loc, (to) - (loc) - 3, b)
1442
1443/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1444#define INSERT_JUMP2(op, loc, to, arg) \
1445 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1446
1447
1448/* This is not an arbitrary limit: the arguments which represent offsets
1449 into the pattern are two bytes long. So if 2^16 bytes turns out to
1450 be too small, many things would have to change. */
1451#define MAX_BUF_SIZE (1L << 16)
1452
1453
1454/* Extend the buffer by twice its current size via realloc and
1455 reset the pointers that pointed into the old block to point to the
1456 correct places in the new one. If extending the buffer results in it
1457 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1458#define EXTEND_BUFFER() \
1459 do { \
1460 unsigned char *old_buffer = bufp->buffer; \
1461 if (bufp->allocated == MAX_BUF_SIZE) \
1462 return REG_ESIZE; \
1463 bufp->allocated <<= 1; \
1464 if (bufp->allocated > MAX_BUF_SIZE) \
1465 bufp->allocated = MAX_BUF_SIZE; \
1466 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1467 if (bufp->buffer == NULL) \
1468 return REG_ESPACE; \
1469 /* If the buffer moved, move all the pointers into it. */ \
1470 if (old_buffer != bufp->buffer) \
1471 { \
1472 b = (b - old_buffer) + bufp->buffer; \
1473 begalt = (begalt - old_buffer) + bufp->buffer; \
1474 if (fixup_alt_jump) \
1475 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1476 if (laststart) \
1477 laststart = (laststart - old_buffer) + bufp->buffer; \
1478 if (pending_exact) \
1479 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1480 } \
1481 } while (0)
1482
1483
1484/* Since we have one byte reserved for the register number argument to
1485 {start,stop}_memory, the maximum number of groups we can report
1486 things about is what fits in that byte. */
1487#define MAX_REGNUM 255
1488
1489/* But patterns can have more than `MAX_REGNUM' registers. We just
1490 ignore the excess. */
1491typedef unsigned regnum_t;
1492
1493
1494/* Macros for the compile stack. */
1495
1496/* Since offsets can go either forwards or backwards, this type needs to
1497 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1498typedef int pattern_offset_t;
1499
1500typedef struct
1501{
1502 pattern_offset_t begalt_offset;
1503 pattern_offset_t fixup_alt_jump;
1504 pattern_offset_t inner_group_offset;
1505 pattern_offset_t laststart_offset;
1506 regnum_t regnum;
1507} compile_stack_elt_t;
1508
1509
1510typedef struct
1511{
1512 compile_stack_elt_t *stack;
1513 unsigned size;
1514 unsigned avail; /* Offset of next open position. */
1515} compile_stack_type;
1516
1517
1518#define INIT_COMPILE_STACK_SIZE 32
1519
1520#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1521#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1522
1523/* The next available element. */
1524#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1525
1526
1527/* Set the bit for character C in a list. */
1528#define SET_LIST_BIT(c) \
1529 (b[((unsigned char) (c)) / BYTEWIDTH] \
1530 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1531
1532
1533/* Get the next unsigned number in the uncompiled pattern. */
1534#define GET_UNSIGNED_NUMBER(num) \
1535 { if (p != pend) \
1536 { \
1537 PATFETCH (c); \
1538 while (ISDIGIT (c)) \
1539 { \
1540 if (num < 0) \
1541 num = 0; \
1542 num = num * 10 + c - '0'; \
1543 if (p == pend) \
1544 break; \
1545 PATFETCH (c); \
1546 } \
1547 } \
1548 }
1549
1550#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1551
1552#define IS_CHAR_CLASS(string) \
1553 (STREQ (string, "alpha") || STREQ (string, "upper") \
1554 || STREQ (string, "lower") || STREQ (string, "digit") \
1555 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1556 || STREQ (string, "space") || STREQ (string, "print") \
1557 || STREQ (string, "punct") || STREQ (string, "graph") \
1558 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1559
1560
1561#ifndef MATCH_MAY_ALLOCATE
1562
1563/* If we cannot allocate large objects within re_match_2_internal,
1564 we make the fail stack and register vectors global.
1565 The fail stack, we grow to the maximum size when a regexp
1566 is compiled.
1567 The register vectors, we adjust in size each time we
1568 compile a regexp, according to the number of registers it needs. */
1569
1570static fail_stack_type fail_stack;
1571
1572/* Size with which the following vectors are currently allocated.
1573 That is so we can make them bigger as needed,
1574 but never make them smaller. */
1575static int regs_allocated_size;
1576
1577static const char ** regstart, ** regend;
1578static const char ** old_regstart, ** old_regend;
1579static const char **best_regstart, **best_regend;
1580static register_info_type *reg_info;
1581static const char **reg_dummy;
1582static register_info_type *reg_info_dummy;
1583
1584/* Make the register vectors big enough for NUM_REGS registers,
1585 but don't make them smaller. */
1586
1587static
1588regex_grow_registers (num_regs)
1589 int num_regs;
1590{
1591 if (num_regs > regs_allocated_size)
1592 {
1593 RETALLOC_IF (regstart, num_regs, const char *);
1594 RETALLOC_IF (regend, num_regs, const char *);
1595 RETALLOC_IF (old_regstart, num_regs, const char *);
1596 RETALLOC_IF (old_regend, num_regs, const char *);
1597 RETALLOC_IF (best_regstart, num_regs, const char *);
1598 RETALLOC_IF (best_regend, num_regs, const char *);
1599 RETALLOC_IF (reg_info, num_regs, register_info_type);
1600 RETALLOC_IF (reg_dummy, num_regs, const char *);
1601 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1602
1603 regs_allocated_size = num_regs;
1604 }
1605}
1606
1607#endif /* not MATCH_MAY_ALLOCATE */
1608
1609
1610/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1611 Returns one of error codes defined in `regex.h', or zero for success.
1612
1613 Assumes the `allocated' (and perhaps `buffer') and `translate'
1614 fields are set in BUFP on entry.
1615
1616 If it succeeds, results are put in BUFP (if it returns an error, the
1617 contents of BUFP are undefined):
1618 `buffer' is the compiled pattern;
1619 `syntax' is set to SYNTAX;
1620 `used' is set to the length of the compiled pattern;
1621 `fastmap_accurate' is zero;
1622 `re_nsub' is the number of subexpressions in PATTERN;
1623 `not_bol' and `not_eol' are zero;
1624
1625 The `fastmap' and `newline_anchor' fields are neither
1626 examined nor set. */
1627
1628/* Return, freeing storage we allocated. */
1629#define FREE_STACK_RETURN(value) \
1630 return (free (compile_stack.stack), value)
1631
1632static reg_errcode_t
1633regex_compile (pattern, size, syntax, bufp)
1634 const char *pattern;
1635 int size;
1636 reg_syntax_t syntax;
1637 struct re_pattern_buffer *bufp;
1638{
1639 /* We fetch characters from PATTERN here. Even though PATTERN is
1640 `char *' (i.e., signed), we declare these variables as unsigned, so
1641 they can be reliably used as array indices. */
1642 register unsigned char c, c1;
1643
1644 /* A random temporary spot in PATTERN. */
1645 const char *p1;
1646
1647 /* Points to the end of the buffer, where we should append. */
1648 register unsigned char *b;
1649
1650 /* Keeps track of unclosed groups. */
1651 compile_stack_type compile_stack;
1652
1653 /* Points to the current (ending) position in the pattern. */
1654 const char *p = pattern;
1655 const char *pend = pattern + size;
1656
1657 /* How to translate the characters in the pattern. */
1658 char *translate = bufp->translate;
1659
1660 /* Address of the count-byte of the most recently inserted `exactn'
1661 command. This makes it possible to tell if a new exact-match
1662 character can be added to that command or if the character requires
1663 a new `exactn' command. */
1664 unsigned char *pending_exact = 0;
1665
1666 /* Address of start of the most recently finished expression.
1667 This tells, e.g., postfix * where to find the start of its
1668 operand. Reset at the beginning of groups and alternatives. */
1669 unsigned char *laststart = 0;
1670
1671 /* Address of beginning of regexp, or inside of last group. */
1672 unsigned char *begalt;
1673
1674 /* Place in the uncompiled pattern (i.e., the {) to
1675 which to go back if the interval is invalid. */
1676 const char *beg_interval;
1677
1678 /* Address of the place where a forward jump should go to the end of
1679 the containing expression. Each alternative of an `or' -- except the
1680 last -- ends with a forward jump of this sort. */
1681 unsigned char *fixup_alt_jump = 0;
1682
1683 /* Counts open-groups as they are encountered. Remembered for the
1684 matching close-group on the compile stack, so the same register
1685 number is put in the stop_memory as the start_memory. */
1686 regnum_t regnum = 0;
1687
1688#ifdef DEBUG
1689 DEBUG_PRINT1 ("\nCompiling pattern: ");
1690 if (debug)
1691 {
1692 unsigned debug_count;
1693
1694 for (debug_count = 0; debug_count < size; debug_count++)
1695 putchar (pattern[debug_count]);
1696 putchar ('\n');
1697 }
1698#endif /* DEBUG */
1699
1700 /* Initialize the compile stack. */
1701 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1702 if (compile_stack.stack == NULL)
1703 return REG_ESPACE;
1704
1705 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1706 compile_stack.avail = 0;
1707
1708 /* Initialize the pattern buffer. */
1709 bufp->syntax = syntax;
1710 bufp->fastmap_accurate = 0;
1711 bufp->not_bol = bufp->not_eol = 0;
1712
1713 /* Set `used' to zero, so that if we return an error, the pattern
1714 printer (for debugging) will think there's no pattern. We reset it
1715 at the end. */
1716 bufp->used = 0;
1717
1718 /* Always count groups, whether or not bufp->no_sub is set. */
1719 bufp->re_nsub = 0;
1720
1721#if !defined (emacs) && !defined (SYNTAX_TABLE)
1722 /* Initialize the syntax table. */
1723 init_syntax_once ();
1724#endif
1725
1726 if (bufp->allocated == 0)
1727 {
1728 if (bufp->buffer)
1729 { /* If zero allocated, but buffer is non-null, try to realloc
1730 enough space. This loses if buffer's address is bogus, but
1731 that is the user's responsibility. */
1732 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1733 }
1734 else
1735 { /* Caller did not allocate a buffer. Do it for them. */
1736 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1737 }
1738 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1739
1740 bufp->allocated = INIT_BUF_SIZE;
1741 }
1742
1743 begalt = b = bufp->buffer;
1744
1745 /* Loop through the uncompiled pattern until we're at the end. */
1746 while (p != pend)
1747 {
1748 PATFETCH (c);
1749
1750 switch (c)
1751 {
1752 case '^':
1753 {
1754 if ( /* If at start of pattern, it's an operator. */
1755 p == pattern + 1
1756 /* If context independent, it's an operator. */
1757 || syntax & RE_CONTEXT_INDEP_ANCHORS
1758 /* Otherwise, depends on what's come before. */
1759 || at_begline_loc_p (pattern, p, syntax))
1760 BUF_PUSH (begline);
1761 else
1762 goto normal_char;
1763 }
1764 break;
1765
1766
1767 case '$':
1768 {
1769 if ( /* If at end of pattern, it's an operator. */
1770 p == pend
1771 /* If context independent, it's an operator. */
1772 || syntax & RE_CONTEXT_INDEP_ANCHORS
1773 /* Otherwise, depends on what's next. */
1774 || at_endline_loc_p (p, pend, syntax))
1775 BUF_PUSH (endline);
1776 else
1777 goto normal_char;
1778 }
1779 break;
1780
1781
1782 case '+':
1783 case '?':
1784 if ((syntax & RE_BK_PLUS_QM)
1785 || (syntax & RE_LIMITED_OPS))
1786 goto normal_char;
1787 handle_plus:
1788 case '*':
1789 /* If there is no previous pattern... */
1790 if (!laststart)
1791 {
1792 if (syntax & RE_CONTEXT_INVALID_OPS)
1793 FREE_STACK_RETURN (REG_BADRPT);
1794 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1795 goto normal_char;
1796 }
1797
1798 {
1799 /* Are we optimizing this jump? */
1800 boolean keep_string_p = false;
1801
1802 /* 1 means zero (many) matches is allowed. */
1803 char zero_times_ok = 0, many_times_ok = 0;
1804
1805 /* If there is a sequence of repetition chars, collapse it
1806 down to just one (the right one). We can't combine
1807 interval operators with these because of, e.g., `a{2}*',
1808 which should only match an even number of `a's. */
1809
1810 for (;;)
1811 {
1812 zero_times_ok |= c != '+';
1813 many_times_ok |= c != '?';
1814
1815 if (p == pend)
1816 break;
1817
1818 PATFETCH (c);
1819
1820 if (c == '*'
1821 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1822 ;
1823
1824 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1825 {
1826 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
1827
1828 PATFETCH (c1);
1829 if (!(c1 == '+' || c1 == '?'))
1830 {
1831 PATUNFETCH;
1832 PATUNFETCH;
1833 break;
1834 }
1835
1836 c = c1;
1837 }
1838 else
1839 {
1840 PATUNFETCH;
1841 break;
1842 }
1843
1844 /* If we get here, we found another repeat character. */
1845 }
1846
1847 /* Star, etc. applied to an empty pattern is equivalent
1848 to an empty pattern. */
1849 if (!laststart)
1850 break;
1851
1852 /* Now we know whether or not zero matches is allowed
1853 and also whether or not two or more matches is allowed. */
1854 if (many_times_ok)
1855 { /* More than one repetition is allowed, so put in at the
1856 end a backward relative jump from `b' to before the next
1857 jump we're going to put in below (which jumps from
1858 laststart to after this jump).
1859
1860 But if we are at the `*' in the exact sequence `.*\n',
1861 insert an unconditional jump backwards to the .,
1862 instead of the beginning of the loop. This way we only
1863 push a failure point once, instead of every time
1864 through the loop. */
1865 assert (p - 1 > pattern);
1866
1867 /* Allocate the space for the jump. */
1868 GET_BUFFER_SPACE (3);
1869
1870 /* We know we are not at the first character of the pattern,
1871 because laststart was nonzero. And we've already
1872 incremented `p', by the way, to be the character after
1873 the `*'. Do we have to do something analogous here
1874 for null bytes, because of RE_DOT_NOT_NULL? */
1875 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1876 && zero_times_ok
1877 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1878 && !(syntax & RE_DOT_NEWLINE))
1879 { /* We have .*\n. */
1880 STORE_JUMP (jump, b, laststart);
1881 keep_string_p = true;
1882 }
1883 else
1884 /* Anything else. */
1885 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1886
1887 /* We've added more stuff to the buffer. */
1888 b += 3;
1889 }
1890
1891 /* On failure, jump from laststart to b + 3, which will be the
1892 end of the buffer after this jump is inserted. */
1893 GET_BUFFER_SPACE (3);
1894 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1895 : on_failure_jump,
1896 laststart, b + 3);
1897 pending_exact = 0;
1898 b += 3;
1899
1900 if (!zero_times_ok)
1901 {
1902 /* At least one repetition is required, so insert a
1903 `dummy_failure_jump' before the initial
1904 `on_failure_jump' instruction of the loop. This
1905 effects a skip over that instruction the first time
1906 we hit that loop. */
1907 GET_BUFFER_SPACE (3);
1908 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1909 b += 3;
1910 }
1911 }
1912 break;
1913
1914
1915 case '.':
1916 laststart = b;
1917 BUF_PUSH (anychar);
1918 break;
1919
1920
1921 case '[':
1922 {
1923 boolean had_char_class = false;
1924
1925 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1926
1927 /* Ensure that we have enough space to push a charset: the
1928 opcode, the length count, and the bitset; 34 bytes in all. */
1929 GET_BUFFER_SPACE (34);
1930
1931 laststart = b;
1932
1933 /* We test `*p == '^' twice, instead of using an if
1934 statement, so we only need one BUF_PUSH. */
1935 BUF_PUSH (*p == '^' ? charset_not : charset);
1936 if (*p == '^')
1937 p++;
1938
1939 /* Remember the first position in the bracket expression. */
1940 p1 = p;
1941
1942 /* Push the number of bytes in the bitmap. */
1943 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1944
1945 /* Clear the whole map. */
1946 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1947
1948 /* charset_not matches newline according to a syntax bit. */
1949 if ((re_opcode_t) b[-2] == charset_not
1950 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1951 SET_LIST_BIT ('\n');
1952
1953 /* Read in characters and ranges, setting map bits. */
1954 for (;;)
1955 {
1956 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1957
1958 PATFETCH (c);
1959
1960 /* \ might escape characters inside [...] and [^...]. */
1961 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1962 {
1963 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
1964
1965 PATFETCH (c1);
1966 SET_LIST_BIT (c1);
1967 continue;
1968 }
1969
1970 /* Could be the end of the bracket expression. If it's
1971 not (i.e., when the bracket expression is `[]' so
1972 far), the ']' character bit gets set way below. */
1973 if (c == ']' && p != p1 + 1)
1974 break;
1975
1976 /* Look ahead to see if it's a range when the last thing
1977 was a character class. */
1978 if (had_char_class && c == '-' && *p != ']')
1979 FREE_STACK_RETURN (REG_ERANGE);
1980
1981 /* Look ahead to see if it's a range when the last thing
1982 was a character: if this is a hyphen not at the
1983 beginning or the end of a list, then it's the range
1984 operator. */
1985 if (c == '-'
1986 && !(p - 2 >= pattern && p[-2] == '[')
1987 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1988 && *p != ']')
1989 {
1990 reg_errcode_t ret
1991 = compile_range (&p, pend, translate, syntax, b);
1992 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
1993 }
1994
1995 else if (p[0] == '-' && p[1] != ']')
1996 { /* This handles ranges made up of characters only. */
1997 reg_errcode_t ret;
1998
1999 /* Move past the `-'. */
2000 PATFETCH (c1);
2001
2002 ret = compile_range (&p, pend, translate, syntax, b);
2003 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2004 }
2005
2006 /* See if we're at the beginning of a possible character
2007 class. */
2008
2009 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2010 { /* Leave room for the null. */
2011 char str[CHAR_CLASS_MAX_LENGTH + 1];
2012
2013 PATFETCH (c);
2014 c1 = 0;
2015
2016 /* If pattern is `[[:'. */
2017 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2018
2019 for (;;)
2020 {
2021 PATFETCH (c);
2022 if (c == ':' || c == ']' || p == pend
2023 || c1 == CHAR_CLASS_MAX_LENGTH)
2024 break;
2025 str[c1++] = c;
2026 }
2027 str[c1] = '\0';
2028
2029 /* If isn't a word bracketed by `[:' and:`]':
2030 undo the ending character, the letters, and leave
2031 the leading `:' and `[' (but set bits for them). */
2032 if (c == ':' && *p == ']')
2033 {
2034 int ch;
2035 boolean is_alnum = STREQ (str, "alnum");
2036 boolean is_alpha = STREQ (str, "alpha");
2037 boolean is_blank = STREQ (str, "blank");
2038 boolean is_cntrl = STREQ (str, "cntrl");
2039 boolean is_digit = STREQ (str, "digit");
2040 boolean is_graph = STREQ (str, "graph");
2041 boolean is_lower = STREQ (str, "lower");
2042 boolean is_print = STREQ (str, "print");
2043 boolean is_punct = STREQ (str, "punct");
2044 boolean is_space = STREQ (str, "space");
2045 boolean is_upper = STREQ (str, "upper");
2046 boolean is_xdigit = STREQ (str, "xdigit");
2047
2048 if (!IS_CHAR_CLASS (str))
2049 FREE_STACK_RETURN (REG_ECTYPE);
2050
2051 /* Throw away the ] at the end of the character
2052 class. */
2053 PATFETCH (c);
2054
2055 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2056
2057 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2058 {
2059 /* This was split into 3 if's to
2060 avoid an arbitrary limit in some compiler. */
2061 if ( (is_alnum && ISALNUM (ch))
2062 || (is_alpha && ISALPHA (ch))
2063 || (is_blank && ISBLANK (ch))
2064 || (is_cntrl && ISCNTRL (ch)))
2065 SET_LIST_BIT (ch);
2066 if ( (is_digit && ISDIGIT (ch))
2067 || (is_graph && ISGRAPH (ch))
2068 || (is_lower && ISLOWER (ch))
2069 || (is_print && ISPRINT (ch)))
2070 SET_LIST_BIT (ch);
2071 if ( (is_punct && ISPUNCT (ch))
2072 || (is_space && ISSPACE (ch))
2073 || (is_upper && ISUPPER (ch))
2074 || (is_xdigit && ISXDIGIT (ch)))
2075 SET_LIST_BIT (ch);
2076 }
2077 had_char_class = true;
2078 }
2079 else
2080 {
2081 c1++;
2082 while (c1--)
2083 PATUNFETCH;
2084 SET_LIST_BIT ('[');
2085 SET_LIST_BIT (':');
2086 had_char_class = false;
2087 }
2088 }
2089 else
2090 {
2091 had_char_class = false;
2092 SET_LIST_BIT (c);
2093 }
2094 }
2095
2096 /* Discard any (non)matching list bytes that are all 0 at the
2097 end of the map. Decrease the map-length byte too. */
2098 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2099 b[-1]--;
2100 b += b[-1];
2101 }
2102 break;
2103
2104
2105 case '(':
2106 if (syntax & RE_NO_BK_PARENS)
2107 goto handle_open;
2108 else
2109 goto normal_char;
2110
2111
2112 case ')':
2113 if (syntax & RE_NO_BK_PARENS)
2114 goto handle_close;
2115 else
2116 goto normal_char;
2117
2118
2119 case '\n':
2120 if (syntax & RE_NEWLINE_ALT)
2121 goto handle_alt;
2122 else
2123 goto normal_char;
2124
2125
2126 case '|':
2127 if (syntax & RE_NO_BK_VBAR)
2128 goto handle_alt;
2129 else
2130 goto normal_char;
2131
2132
2133 case '{':
2134 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2135 goto handle_interval;
2136 else
2137 goto normal_char;
2138
2139
2140 case '\\':
2141 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2142
2143 /* Do not translate the character after the \, so that we can
2144 distinguish, e.g., \B from \b, even if we normally would
2145 translate, e.g., B to b. */
2146 PATFETCH_RAW (c);
2147
2148 switch (c)
2149 {
2150 case '(':
2151 if (syntax & RE_NO_BK_PARENS)
2152 goto normal_backslash;
2153
2154 handle_open:
2155 bufp->re_nsub++;
2156 regnum++;
2157
2158 if (COMPILE_STACK_FULL)
2159 {
2160 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2161 compile_stack_elt_t);
2162 if (compile_stack.stack == NULL) return REG_ESPACE;
2163
2164 compile_stack.size <<= 1;
2165 }
2166
2167 /* These are the values to restore when we hit end of this
2168 group. They are all relative offsets, so that if the
2169 whole pattern moves because of realloc, they will still
2170 be valid. */
2171 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2172 COMPILE_STACK_TOP.fixup_alt_jump
2173 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2174 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2175 COMPILE_STACK_TOP.regnum = regnum;
2176
2177 /* We will eventually replace the 0 with the number of
2178 groups inner to this one. But do not push a
2179 start_memory for groups beyond the last one we can
2180 represent in the compiled pattern. */
2181 if (regnum <= MAX_REGNUM)
2182 {
2183 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2184 BUF_PUSH_3 (start_memory, regnum, 0);
2185 }
2186
2187 compile_stack.avail++;
2188
2189 fixup_alt_jump = 0;
2190 laststart = 0;
2191 begalt = b;
2192 /* If we've reached MAX_REGNUM groups, then this open
2193 won't actually generate any code, so we'll have to
2194 clear pending_exact explicitly. */
2195 pending_exact = 0;
2196 break;
2197
2198
2199 case ')':
2200 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2201
2202 if (COMPILE_STACK_EMPTY)
2203 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2204 goto normal_backslash;
2205 else
2206 FREE_STACK_RETURN (REG_ERPAREN);
2207
2208 handle_close:
2209 if (fixup_alt_jump)
2210 { /* Push a dummy failure point at the end of the
2211 alternative for a possible future
2212 `pop_failure_jump' to pop. See comments at
2213 `push_dummy_failure' in `re_match_2'. */
2214 BUF_PUSH (push_dummy_failure);
2215
2216 /* We allocated space for this jump when we assigned
2217 to `fixup_alt_jump', in the `handle_alt' case below. */
2218 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2219 }
2220
2221 /* See similar code for backslashed left paren above. */
2222 if (COMPILE_STACK_EMPTY)
2223 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2224 goto normal_char;
2225 else
2226 FREE_STACK_RETURN (REG_ERPAREN);
2227
2228 /* Since we just checked for an empty stack above, this
2229 ``can't happen''. */
2230 assert (compile_stack.avail != 0);
2231 {
2232 /* We don't just want to restore into `regnum', because
2233 later groups should continue to be numbered higher,
2234 as in `(ab)c(de)' -- the second group is #2. */
2235 regnum_t this_group_regnum;
2236
2237 compile_stack.avail--;
2238 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2239 fixup_alt_jump
2240 = COMPILE_STACK_TOP.fixup_alt_jump
2241 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2242 : 0;
2243 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2244 this_group_regnum = COMPILE_STACK_TOP.regnum;
2245 /* If we've reached MAX_REGNUM groups, then this open
2246 won't actually generate any code, so we'll have to
2247 clear pending_exact explicitly. */
2248 pending_exact = 0;
2249
2250 /* We're at the end of the group, so now we know how many
2251 groups were inside this one. */
2252 if (this_group_regnum <= MAX_REGNUM)
2253 {
2254 unsigned char *inner_group_loc
2255 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2256
2257 *inner_group_loc = regnum - this_group_regnum;
2258 BUF_PUSH_3 (stop_memory, this_group_regnum,
2259 regnum - this_group_regnum);
2260 }
2261 }
2262 break;
2263
2264
2265 case '|': /* `\|'. */
2266 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2267 goto normal_backslash;
2268 handle_alt:
2269 if (syntax & RE_LIMITED_OPS)
2270 goto normal_char;
2271
2272 /* Insert before the previous alternative a jump which
2273 jumps to this alternative if the former fails. */
2274 GET_BUFFER_SPACE (3);
2275 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2276 pending_exact = 0;
2277 b += 3;
2278
2279 /* The alternative before this one has a jump after it
2280 which gets executed if it gets matched. Adjust that
2281 jump so it will jump to this alternative's analogous
2282 jump (put in below, which in turn will jump to the next
2283 (if any) alternative's such jump, etc.). The last such
2284 jump jumps to the correct final destination. A picture:
2285 _____ _____
2286 | | | |
2287 | v | v
2288 a | b | c
2289
2290 If we are at `b', then fixup_alt_jump right now points to a
2291 three-byte space after `a'. We'll put in the jump, set
2292 fixup_alt_jump to right after `b', and leave behind three
2293 bytes which we'll fill in when we get to after `c'. */
2294
2295 if (fixup_alt_jump)
2296 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2297
2298 /* Mark and leave space for a jump after this alternative,
2299 to be filled in later either by next alternative or
2300 when know we're at the end of a series of alternatives. */
2301 fixup_alt_jump = b;
2302 GET_BUFFER_SPACE (3);
2303 b += 3;
2304
2305 laststart = 0;
2306 begalt = b;
2307 break;
2308
2309
2310 case '{':
2311 /* If \{ is a literal. */
2312 if (!(syntax & RE_INTERVALS)
2313 /* If we're at `\{' and it's not the open-interval
2314 operator. */
2315 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2316 || (p - 2 == pattern && p == pend))
2317 goto normal_backslash;
2318
2319 handle_interval:
2320 {
2321 /* If got here, then the syntax allows intervals. */
2322
2323 /* At least (most) this many matches must be made. */
2324 int lower_bound = -1, upper_bound = -1;
2325
2326 beg_interval = p - 1;
2327
2328 if (p == pend)
2329 {
2330 if (syntax & RE_NO_BK_BRACES)
2331 goto unfetch_interval;
2332 else
2333 FREE_STACK_RETURN (REG_EBRACE);
2334 }
2335
2336 GET_UNSIGNED_NUMBER (lower_bound);
2337
2338 if (c == ',')
2339 {
2340 GET_UNSIGNED_NUMBER (upper_bound);
2341 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2342 }
2343 else
2344 /* Interval such as `{1}' => match exactly once. */
2345 upper_bound = lower_bound;
2346
2347 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2348 || lower_bound > upper_bound)
2349 {
2350 if (syntax & RE_NO_BK_BRACES)
2351 goto unfetch_interval;
2352 else
2353 FREE_STACK_RETURN (REG_BADBR);
2354 }
2355
2356 if (!(syntax & RE_NO_BK_BRACES))
2357 {
2358 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2359
2360 PATFETCH (c);
2361 }
2362
2363 if (c != '}')
2364 {
2365 if (syntax & RE_NO_BK_BRACES)
2366 goto unfetch_interval;
2367 else
2368 FREE_STACK_RETURN (REG_BADBR);
2369 }
2370
2371 /* We just parsed a valid interval. */
2372
2373 /* If it's invalid to have no preceding re. */
2374 if (!laststart)
2375 {
2376 if (syntax & RE_CONTEXT_INVALID_OPS)
2377 FREE_STACK_RETURN (REG_BADRPT);
2378 else if (syntax & RE_CONTEXT_INDEP_OPS)
2379 laststart = b;
2380 else
2381 goto unfetch_interval;
2382 }
2383
2384 /* If the upper bound is zero, don't want to succeed at
2385 all; jump from `laststart' to `b + 3', which will be
2386 the end of the buffer after we insert the jump. */
2387 if (upper_bound == 0)
2388 {
2389 GET_BUFFER_SPACE (3);
2390 INSERT_JUMP (jump, laststart, b + 3);
2391 b += 3;
2392 }
2393
2394 /* Otherwise, we have a nontrivial interval. When
2395 we're all done, the pattern will look like:
2396 set_number_at <jump count> <upper bound>
2397 set_number_at <succeed_n count> <lower bound>
2398 succeed_n <after jump addr> <succeed_n count>
2399 <body of loop>
2400 jump_n <succeed_n addr> <jump count>
2401 (The upper bound and `jump_n' are omitted if
2402 `upper_bound' is 1, though.) */
2403 else
2404 { /* If the upper bound is > 1, we need to insert
2405 more at the end of the loop. */
2406 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2407
2408 GET_BUFFER_SPACE (nbytes);
2409
2410 /* Initialize lower bound of the `succeed_n', even
2411 though it will be set during matching by its
2412 attendant `set_number_at' (inserted next),
2413 because `re_compile_fastmap' needs to know.
2414 Jump to the `jump_n' we might insert below. */
2415 INSERT_JUMP2 (succeed_n, laststart,
2416 b + 5 + (upper_bound > 1) * 5,
2417 lower_bound);
2418 b += 5;
2419
2420 /* Code to initialize the lower bound. Insert
2421 before the `succeed_n'. The `5' is the last two
2422 bytes of this `set_number_at', plus 3 bytes of
2423 the following `succeed_n'. */
2424 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2425 b += 5;
2426
2427 if (upper_bound > 1)
2428 { /* More than one repetition is allowed, so
2429 append a backward jump to the `succeed_n'
2430 that starts this interval.
2431
2432 When we've reached this during matching,
2433 we'll have matched the interval once, so
2434 jump back only `upper_bound - 1' times. */
2435 STORE_JUMP2 (jump_n, b, laststart + 5,
2436 upper_bound - 1);
2437 b += 5;
2438
2439 /* The location we want to set is the second
2440 parameter of the `jump_n'; that is `b-2' as
2441 an absolute address. `laststart' will be
2442 the `set_number_at' we're about to insert;
2443 `laststart+3' the number to set, the source
2444 for the relative address. But we are
2445 inserting into the middle of the pattern --
2446 so everything is getting moved up by 5.
2447 Conclusion: (b - 2) - (laststart + 3) + 5,
2448 i.e., b - laststart.
2449
2450 We insert this at the beginning of the loop
2451 so that if we fail during matching, we'll
2452 reinitialize the bounds. */
2453 insert_op2 (set_number_at, laststart, b - laststart,
2454 upper_bound - 1, b);
2455 b += 5;
2456 }
2457 }
2458 pending_exact = 0;
2459 beg_interval = NULL;
2460 }
2461 break;
2462
2463 unfetch_interval:
2464 /* If an invalid interval, match the characters as literals. */
2465 assert (beg_interval);
2466 p = beg_interval;
2467 beg_interval = NULL;
2468
2469 /* normal_char and normal_backslash need `c'. */
2470 PATFETCH (c);
2471
2472 if (!(syntax & RE_NO_BK_BRACES))
2473 {
2474 if (p > pattern && p[-1] == '\\')
2475 goto normal_backslash;
2476 }
2477 goto normal_char;
2478
2479#ifdef emacs
2480 /* There is no way to specify the before_dot and after_dot
2481 operators. rms says this is ok. --karl */
2482 case '=':
2483 BUF_PUSH (at_dot);
2484 break;
2485
2486 case 's':
2487 laststart = b;
2488 PATFETCH (c);
2489 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2490 break;
2491
2492 case 'S':
2493 laststart = b;
2494 PATFETCH (c);
2495 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2496 break;
2497#endif /* emacs */
2498
2499
2500 case 'w':
2501 laststart = b;
2502 BUF_PUSH (wordchar);
2503 break;
2504
2505
2506 case 'W':
2507 laststart = b;
2508 BUF_PUSH (notwordchar);
2509 break;
2510
2511
2512 case '<':
2513 BUF_PUSH (wordbeg);
2514 break;
2515
2516 case '>':
2517 BUF_PUSH (wordend);
2518 break;
2519
2520 case 'b':
2521 BUF_PUSH (wordbound);
2522 break;
2523
2524 case 'B':
2525 BUF_PUSH (notwordbound);
2526 break;
2527
2528 case '`':
2529 BUF_PUSH (begbuf);
2530 break;
2531
2532 case '\'':
2533 BUF_PUSH (endbuf);
2534 break;
2535
2536 case '1': case '2': case '3': case '4': case '5':
2537 case '6': case '7': case '8': case '9':
2538 if (syntax & RE_NO_BK_REFS)
2539 goto normal_char;
2540
2541 c1 = c - '0';
2542
2543 if (c1 > regnum)
2544 FREE_STACK_RETURN (REG_ESUBREG);
2545
2546 /* Can't back reference to a subexpression if inside of it. */
2547 if (group_in_compile_stack (compile_stack, c1))
2548 goto normal_char;
2549
2550 laststart = b;
2551 BUF_PUSH_2 (duplicate, c1);
2552 break;
2553
2554
2555 case '+':
2556 case '?':
2557 if (syntax & RE_BK_PLUS_QM)
2558 goto handle_plus;
2559 else
2560 goto normal_backslash;
2561
2562 default:
2563 normal_backslash:
2564 /* You might think it would be useful for \ to mean
2565 not to translate; but if we don't translate it
2566 it will never match anything. */
2567 c = TRANSLATE (c);
2568 goto normal_char;
2569 }
2570 break;
2571
2572
2573 default:
2574 /* Expects the character in `c'. */
2575 normal_char:
2576 /* If no exactn currently being built. */
2577 if (!pending_exact
2578
2579 /* If last exactn not at current position. */
2580 || pending_exact + *pending_exact + 1 != b
2581
2582 /* We have only one byte following the exactn for the count. */
2583 || *pending_exact == (1 << BYTEWIDTH) - 1
2584
2585 /* If followed by a repetition operator. */
2586 || *p == '*' || *p == '^'
2587 || ((syntax & RE_BK_PLUS_QM)
2588 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2589 : (*p == '+' || *p == '?'))
2590 || ((syntax & RE_INTERVALS)
2591 && ((syntax & RE_NO_BK_BRACES)
2592 ? *p == '{'
2593 : (p[0] == '\\' && p[1] == '{'))))
2594 {
2595 /* Start building a new exactn. */
2596
2597 laststart = b;
2598
2599 BUF_PUSH_2 (exactn, 0);
2600 pending_exact = b - 1;
2601 }
2602
2603 BUF_PUSH (c);
2604 (*pending_exact)++;
2605 break;
2606 } /* switch (c) */
2607 } /* while p != pend */
2608
2609
2610 /* Through the pattern now. */
2611
2612 if (fixup_alt_jump)
2613 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2614
2615 if (!COMPILE_STACK_EMPTY)
2616 FREE_STACK_RETURN (REG_EPAREN);
2617
2618 /* If we don't want backtracking, force success
2619 the first time we reach the end of the compiled pattern. */
2620 if (syntax & RE_NO_POSIX_BACKTRACKING)
2621 BUF_PUSH (succeed);
2622
2623 free (compile_stack.stack);
2624
2625 /* We have succeeded; set the length of the buffer. */
2626 bufp->used = b - bufp->buffer;
2627
2628#ifdef DEBUG
2629 if (debug)
2630 {
2631 DEBUG_PRINT1 ("\nCompiled pattern: \n");
2632 print_compiled_pattern (bufp);
2633 }
2634#endif /* DEBUG */
2635
2636#ifndef MATCH_MAY_ALLOCATE
2637 /* Initialize the failure stack to the largest possible stack. This
2638 isn't necessary unless we're trying to avoid calling alloca in
2639 the search and match routines. */
2640 {
2641 int num_regs = bufp->re_nsub + 1;
2642
2643 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
2644 is strictly greater than re_max_failures, the largest possible stack
2645 is 2 * re_max_failures failure points. */
2646 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
2647 {
2648 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
2649
2650#ifdef emacs
2651 if (! fail_stack.stack)
2652 fail_stack.stack
2653 = (fail_stack_elt_t *) xmalloc (fail_stack.size
2654 * sizeof (fail_stack_elt_t));
2655 else
2656 fail_stack.stack
2657 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
2658 (fail_stack.size
2659 * sizeof (fail_stack_elt_t)));
2660#else /* not emacs */
2661 if (! fail_stack.stack)
2662 fail_stack.stack
2663 = (fail_stack_elt_t *) malloc (fail_stack.size
2664 * sizeof (fail_stack_elt_t));
2665 else
2666 fail_stack.stack
2667 = (fail_stack_elt_t *) realloc (fail_stack.stack,
2668 (fail_stack.size
2669 * sizeof (fail_stack_elt_t)));
2670#endif /* not emacs */
2671 }
2672
2673 regex_grow_registers (num_regs);
2674 }
2675#endif /* not MATCH_MAY_ALLOCATE */
2676
2677 return REG_NOERROR;
2678} /* regex_compile */
2679
2680
2681/* Subroutines for `regex_compile'. */
2682
2683/* Store OP at LOC followed by two-byte integer parameter ARG. */
2684
2685static void
2686store_op1 (op, loc, arg)
2687 re_opcode_t op;
2688 unsigned char *loc;
2689 int arg;
2690{
2691 *loc = (unsigned char) op;
2692 STORE_NUMBER (loc + 1, arg);
2693}
2694
2695
2696/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2697
2698static void
2699store_op2 (op, loc, arg1, arg2)
2700 re_opcode_t op;
2701 unsigned char *loc;
2702 int arg1, arg2;
2703{
2704 *loc = (unsigned char) op;
2705 STORE_NUMBER (loc + 1, arg1);
2706 STORE_NUMBER (loc + 3, arg2);
2707}
2708
2709
2710/* Copy the bytes from LOC to END to open up three bytes of space at LOC
2711 for OP followed by two-byte integer parameter ARG. */
2712
2713static void
2714insert_op1 (op, loc, arg, end)
2715 re_opcode_t op;
2716 unsigned char *loc;
2717 int arg;
2718 unsigned char *end;
2719{
2720 register unsigned char *pfrom = end;
2721 register unsigned char *pto = end + 3;
2722
2723 while (pfrom != loc)
2724 *--pto = *--pfrom;
2725
2726 store_op1 (op, loc, arg);
2727}
2728
2729
2730/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2731
2732static void
2733insert_op2 (op, loc, arg1, arg2, end)
2734 re_opcode_t op;
2735 unsigned char *loc;
2736 int arg1, arg2;
2737 unsigned char *end;
2738{
2739 register unsigned char *pfrom = end;
2740 register unsigned char *pto = end + 5;
2741
2742 while (pfrom != loc)
2743 *--pto = *--pfrom;
2744
2745 store_op2 (op, loc, arg1, arg2);
2746}
2747
2748
2749/* P points to just after a ^ in PATTERN. Return true if that ^ comes
2750 after an alternative or a begin-subexpression. We assume there is at
2751 least one character before the ^. */
2752
2753static boolean
2754at_begline_loc_p (pattern, p, syntax)
2755 const char *pattern, *p;
2756 reg_syntax_t syntax;
2757{
2758 const char *prev = p - 2;
2759 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2760
2761 return
2762 /* After a subexpression? */
2763 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2764 /* After an alternative? */
2765 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2766}
2767
2768
2769/* The dual of at_begline_loc_p. This one is for $. We assume there is
2770 at least one character after the $, i.e., `P < PEND'. */
2771
2772static boolean
2773at_endline_loc_p (p, pend, syntax)
2774 const char *p, *pend;
2775 int syntax;
2776{
2777 const char *next = p;
2778 boolean next_backslash = *next == '\\';
2779 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2780
2781 return
2782 /* Before a subexpression? */
2783 (syntax & RE_NO_BK_PARENS ? *next == ')'
2784 : next_backslash && next_next && *next_next == ')')
2785 /* Before an alternative? */
2786 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2787 : next_backslash && next_next && *next_next == '|');
2788}
2789
2790
2791/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2792 false if it's not. */
2793
2794static boolean
2795group_in_compile_stack (compile_stack, regnum)
2796 compile_stack_type compile_stack;
2797 regnum_t regnum;
2798{
2799 int this_element;
2800
2801 for (this_element = compile_stack.avail - 1;
2802 this_element >= 0;
2803 this_element--)
2804 if (compile_stack.stack[this_element].regnum == regnum)
2805 return true;
2806
2807 return false;
2808}
2809
2810
2811/* Read the ending character of a range (in a bracket expression) from the
2812 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2813 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2814 Then we set the translation of all bits between the starting and
2815 ending characters (inclusive) in the compiled pattern B.
2816
2817 Return an error code.
2818
2819 We use these short variable names so we can use the same macros as
2820 `regex_compile' itself. */
2821
2822static reg_errcode_t
2823compile_range (p_ptr, pend, translate, syntax, b)
2824 const char **p_ptr, *pend;
2825 char *translate;
2826 reg_syntax_t syntax;
2827 unsigned char *b;
2828{
2829 unsigned this_char;
2830
2831 const char *p = *p_ptr;
2832 int range_start, range_end;
2833
2834 if (p == pend)
2835 return REG_ERANGE;
2836
2837 /* Even though the pattern is a signed `char *', we need to fetch
2838 with unsigned char *'s; if the high bit of the pattern character
2839 is set, the range endpoints will be negative if we fetch using a
2840 signed char *.
2841
2842 We also want to fetch the endpoints without translating them; the
2843 appropriate translation is done in the bit-setting loop below. */
2844 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
2845 range_start = ((const unsigned char *) p)[-2];
2846 range_end = ((const unsigned char *) p)[0];
2847
2848 /* Have to increment the pointer into the pattern string, so the
2849 caller isn't still at the ending character. */
2850 (*p_ptr)++;
2851
2852 /* If the start is after the end, the range is empty. */
2853 if (range_start > range_end)
2854 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2855
2856 /* Here we see why `this_char' has to be larger than an `unsigned
2857 char' -- the range is inclusive, so if `range_end' == 0xff
2858 (assuming 8-bit characters), we would otherwise go into an infinite
2859 loop, since all characters <= 0xff. */
2860 for (this_char = range_start; this_char <= range_end; this_char++)
2861 {
2862 SET_LIST_BIT (TRANSLATE (this_char));
2863 }
2864
2865 return REG_NOERROR;
2866}
2867
2868
2869/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2870 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2871 characters can start a string that matches the pattern. This fastmap
2872 is used by re_search to skip quickly over impossible starting points.
2873
2874 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2875 area as BUFP->fastmap.
2876
2877 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2878 the pattern buffer.
2879
2880 Returns 0 if we succeed, -2 if an internal error. */
2881
2882int
2883re_compile_fastmap (bufp)
2884 struct re_pattern_buffer *bufp;
2885{
2886 int j, k;
2887#ifdef MATCH_MAY_ALLOCATE
2888 fail_stack_type fail_stack;
2889#endif
2890#ifndef REGEX_MALLOC
2891 char *destination;
2892#endif
2893 /* We don't push any register information onto the failure stack. */
2894 unsigned num_regs = 0;
2895
2896 register char *fastmap = bufp->fastmap;
2897 unsigned char *pattern = bufp->buffer;
2898 unsigned long size = bufp->used;
2899 unsigned char *p = pattern;
2900 register unsigned char *pend = pattern + size;
2901
2902 /* This holds the pointer to the failure stack, when
2903 it is allocated relocatably. */
2904 fail_stack_elt_t *failure_stack_ptr;
2905
2906 /* Assume that each path through the pattern can be null until
2907 proven otherwise. We set this false at the bottom of switch
2908 statement, to which we get only if a particular path doesn't
2909 match the empty string. */
2910 boolean path_can_be_null = true;
2911
2912 /* We aren't doing a `succeed_n' to begin with. */
2913 boolean succeed_n_p = false;
2914
2915 assert (fastmap != NULL && p != NULL);
2916
2917 INIT_FAIL_STACK ();
2918 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2919 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2920 bufp->can_be_null = 0;
2921
2922 while (1)
2923 {
2924 if (p == pend || *p == succeed)
2925 {
2926 /* We have reached the (effective) end of pattern. */
2927 if (!FAIL_STACK_EMPTY ())
2928 {
2929 bufp->can_be_null |= path_can_be_null;
2930
2931 /* Reset for next path. */
2932 path_can_be_null = true;
2933
2934 p = fail_stack.stack[--fail_stack.avail].pointer;
2935
2936 continue;
2937 }
2938 else
2939 break;
2940 }
2941
2942 /* We should never be about to go beyond the end of the pattern. */
2943 assert (p < pend);
2944
2945 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
2946 {
2947
2948 /* I guess the idea here is to simply not bother with a fastmap
2949 if a backreference is used, since it's too hard to figure out
2950 the fastmap for the corresponding group. Setting
2951 `can_be_null' stops `re_search_2' from using the fastmap, so
2952 that is all we do. */
2953 case duplicate:
2954 bufp->can_be_null = 1;
2955 goto done;
2956
2957
2958 /* Following are the cases which match a character. These end
2959 with `break'. */
2960
2961 case exactn:
2962 fastmap[p[1]] = 1;
2963 break;
2964
2965
2966 case charset:
2967 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2968 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2969 fastmap[j] = 1;
2970 break;
2971
2972
2973 case charset_not:
2974 /* Chars beyond end of map must be allowed. */
2975 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2976 fastmap[j] = 1;
2977
2978 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2979 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2980 fastmap[j] = 1;
2981 break;
2982
2983
2984 case wordchar:
2985 for (j = 0; j < (1 << BYTEWIDTH); j++)
2986 if (SYNTAX (j) == Sword)
2987 fastmap[j] = 1;
2988 break;
2989
2990
2991 case notwordchar:
2992 for (j = 0; j < (1 << BYTEWIDTH); j++)
2993 if (SYNTAX (j) != Sword)
2994 fastmap[j] = 1;
2995 break;
2996
2997
2998 case anychar:
2999 {
3000 int fastmap_newline = fastmap['\n'];
3001
3002 /* `.' matches anything ... */
3003 for (j = 0; j < (1 << BYTEWIDTH); j++)
3004 fastmap[j] = 1;
3005
3006 /* ... except perhaps newline. */
3007 if (!(bufp->syntax & RE_DOT_NEWLINE))
3008 fastmap['\n'] = fastmap_newline;
3009
3010 /* Return if we have already set `can_be_null'; if we have,
3011 then the fastmap is irrelevant. Something's wrong here. */
3012 else if (bufp->can_be_null)
3013 goto done;
3014
3015 /* Otherwise, have to check alternative paths. */
3016 break;
3017 }
3018
3019#ifdef emacs
3020 case syntaxspec:
3021 k = *p++;
3022 for (j = 0; j < (1 << BYTEWIDTH); j++)
3023 if (SYNTAX (j) == (enum syntaxcode) k)
3024 fastmap[j] = 1;
3025 break;
3026
3027
3028 case notsyntaxspec:
3029 k = *p++;
3030 for (j = 0; j < (1 << BYTEWIDTH); j++)
3031 if (SYNTAX (j) != (enum syntaxcode) k)
3032 fastmap[j] = 1;
3033 break;
3034
3035
3036 /* All cases after this match the empty string. These end with
3037 `continue'. */
3038
3039
3040 case before_dot:
3041 case at_dot:
3042 case after_dot:
3043 continue;
3044#endif /* not emacs */
3045
3046
3047 case no_op:
3048 case begline:
3049 case endline:
3050 case begbuf:
3051 case endbuf:
3052 case wordbound:
3053 case notwordbound:
3054 case wordbeg:
3055 case wordend:
3056 case push_dummy_failure:
3057 continue;
3058
3059
3060 case jump_n:
3061 case pop_failure_jump:
3062 case maybe_pop_jump:
3063 case jump:
3064 case jump_past_alt:
3065 case dummy_failure_jump:
3066 EXTRACT_NUMBER_AND_INCR (j, p);
3067 p += j;
3068 if (j > 0)
3069 continue;
3070
3071 /* Jump backward implies we just went through the body of a
3072 loop and matched nothing. Opcode jumped to should be
3073 `on_failure_jump' or `succeed_n'. Just treat it like an
3074 ordinary jump. For a * loop, it has pushed its failure
3075 point already; if so, discard that as redundant. */
3076 if ((re_opcode_t) *p != on_failure_jump
3077 && (re_opcode_t) *p != succeed_n)
3078 continue;
3079
3080 p++;
3081 EXTRACT_NUMBER_AND_INCR (j, p);
3082 p += j;
3083
3084 /* If what's on the stack is where we are now, pop it. */
3085 if (!FAIL_STACK_EMPTY ()
3086 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3087 fail_stack.avail--;
3088
3089 continue;
3090
3091
3092 case on_failure_jump:
3093 case on_failure_keep_string_jump:
3094 handle_on_failure_jump:
3095 EXTRACT_NUMBER_AND_INCR (j, p);
3096
3097 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3098 end of the pattern. We don't want to push such a point,
3099 since when we restore it above, entering the switch will
3100 increment `p' past the end of the pattern. We don't need
3101 to push such a point since we obviously won't find any more
3102 fastmap entries beyond `pend'. Such a pattern can match
3103 the null string, though. */
3104 if (p + j < pend)
3105 {
3106 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3107 {
3108 RESET_FAIL_STACK ();
3109 return -2;
3110 }
3111 }
3112 else
3113 bufp->can_be_null = 1;
3114
3115 if (succeed_n_p)
3116 {
3117 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3118 succeed_n_p = false;
3119 }
3120
3121 continue;
3122
3123
3124 case succeed_n:
3125 /* Get to the number of times to succeed. */
3126 p += 2;
3127
3128 /* Increment p past the n for when k != 0. */
3129 EXTRACT_NUMBER_AND_INCR (k, p);
3130 if (k == 0)
3131 {
3132 p -= 4;
3133 succeed_n_p = true; /* Spaghetti code alert. */
3134 goto handle_on_failure_jump;
3135 }
3136 continue;
3137
3138
3139 case set_number_at:
3140 p += 4;
3141 continue;
3142
3143
3144 case start_memory:
3145 case stop_memory:
3146 p += 2;
3147 continue;
3148
3149
3150 default:
3151 abort (); /* We have listed all the cases. */
3152 } /* switch *p++ */
3153
3154 /* Getting here means we have found the possible starting
3155 characters for one path of the pattern -- and that the empty
3156 string does not match. We need not follow this path further.
3157 Instead, look at the next alternative (remembered on the
3158 stack), or quit if no more. The test at the top of the loop
3159 does these things. */
3160 path_can_be_null = false;
3161 p = pend;
3162 } /* while p */
3163
3164 /* Set `can_be_null' for the last path (also the first path, if the
3165 pattern is empty). */
3166 bufp->can_be_null |= path_can_be_null;
3167
3168 done:
3169 RESET_FAIL_STACK ();
3170 return 0;
3171} /* re_compile_fastmap */
3172
3173
3174/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3175 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3176 this memory for recording register information. STARTS and ENDS
3177 must be allocated using the malloc library routine, and must each
3178 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3179
3180 If NUM_REGS == 0, then subsequent matches should allocate their own
3181 register data.
3182
3183 Unless this function is called, the first search or match using
3184 PATTERN_BUFFER will allocate its own register data, without
3185 freeing the old data. */
3186
3187void
3188re_set_registers (bufp, regs, num_regs, starts, ends)
3189 struct re_pattern_buffer *bufp;
3190 struct re_registers *regs;
3191 unsigned num_regs;
3192 regoff_t *starts, *ends;
3193{
3194 if (num_regs)
3195 {
3196 bufp->regs_allocated = REGS_REALLOCATE;
3197 regs->num_regs = num_regs;
3198 regs->start = starts;
3199 regs->end = ends;
3200 }
3201 else
3202 {
3203 bufp->regs_allocated = REGS_UNALLOCATED;
3204 regs->num_regs = 0;
3205 regs->start = regs->end = (regoff_t *) 0;
3206 }
3207}
3208
3209
3210/* Searching routines. */
3211
3212/* Like re_search_2, below, but only one string is specified, and
3213 doesn't let you say where to stop matching. */
3214
3215int
3216re_search (bufp, string, size, startpos, range, regs)
3217 struct re_pattern_buffer *bufp;
3218 const char *string;
3219 int size, startpos, range;
3220 struct re_registers *regs;
3221{
3222 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3223 regs, size);
3224}
3225
3226
3227/* Using the compiled pattern in BUFP->buffer, first tries to match the
3228 virtual concatenation of STRING1 and STRING2, starting first at index
3229 STARTPOS, then at STARTPOS + 1, and so on.
3230
3231 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3232
3233 RANGE is how far to scan while trying to match. RANGE = 0 means try
3234 only at STARTPOS; in general, the last start tried is STARTPOS +
3235 RANGE.
3236
3237 In REGS, return the indices of the virtual concatenation of STRING1
3238 and STRING2 that matched the entire BUFP->buffer and its contained
3239 subexpressions.
3240
3241 Do not consider matching one past the index STOP in the virtual
3242 concatenation of STRING1 and STRING2.
3243
3244 We return either the position in the strings at which the match was
3245 found, -1 if no match, or -2 if error (such as failure
3246 stack overflow). */
3247
3248int
3249re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3250 struct re_pattern_buffer *bufp;
3251 const char *string1, *string2;
3252 int size1, size2;
3253 int startpos;
3254 int range;
3255 struct re_registers *regs;
3256 int stop;
3257{
3258 int val;
3259 register char *fastmap = bufp->fastmap;
3260 register char *translate = bufp->translate;
3261 int total_size = size1 + size2;
3262 int endpos = startpos + range;
3263
3264 /* Check for out-of-range STARTPOS. */
3265 if (startpos < 0 || startpos > total_size)
3266 return -1;
3267
3268 /* Fix up RANGE if it might eventually take us outside
3269 the virtual concatenation of STRING1 and STRING2. */
3270 if (endpos < -1)
3271 range = -1 - startpos;
3272 else if (endpos > total_size)
3273 range = total_size - startpos;
3274
3275 /* If the search isn't to be a backwards one, don't waste time in a
3276 search for a pattern that must be anchored. */
3277 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3278 {
3279 if (startpos > 0)
3280 return -1;
3281 else
3282 range = 1;
3283 }
3284
3285 /* Update the fastmap now if not correct already. */
3286 if (fastmap && !bufp->fastmap_accurate)
3287 if (re_compile_fastmap (bufp) == -2)
3288 return -2;
3289
3290 /* Loop through the string, looking for a place to start matching. */
3291 for (;;)
3292 {
3293 /* If a fastmap is supplied, skip quickly over characters that
3294 cannot be the start of a match. If the pattern can match the
3295 null string, however, we don't need to skip characters; we want
3296 the first null string. */
3297 if (fastmap && startpos < total_size && !bufp->can_be_null)
3298 {
3299 if (range > 0) /* Searching forwards. */
3300 {
3301 register const char *d;
3302 register int lim = 0;
3303 int irange = range;
3304
3305 if (startpos < size1 && startpos + range >= size1)
3306 lim = range - (size1 - startpos);
3307
3308 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3309
3310 /* Written out as an if-else to avoid testing `translate'
3311 inside the loop. */
3312 if (translate)
3313 while (range > lim
3314 && !fastmap[(unsigned char)
3315 translate[(unsigned char) *d++]])
3316 range--;
3317 else
3318 while (range > lim && !fastmap[(unsigned char) *d++])
3319 range--;
3320
3321 startpos += irange - range;
3322 }
3323 else /* Searching backwards. */
3324 {
3325 register char c = (size1 == 0 || startpos >= size1
3326 ? string2[startpos - size1]
3327 : string1[startpos]);
3328
3329 if (!fastmap[(unsigned char) TRANSLATE (c)])
3330 goto advance;
3331 }
3332 }
3333
3334 /* If can't match the null string, and that's all we have left, fail. */
3335 if (range >= 0 && startpos == total_size && fastmap
3336 && !bufp->can_be_null)
3337 return -1;
3338
3339 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3340 startpos, regs, stop);
3341#ifndef REGEX_MALLOC
3342#ifdef C_ALLOCA
3343 alloca (0);
3344#endif
3345#endif
3346
3347 if (val >= 0)
3348 return startpos;
3349
3350 if (val == -2)
3351 return -2;
3352
3353 advance:
3354 if (!range)
3355 break;
3356 else if (range > 0)
3357 {
3358 range--;
3359 startpos++;
3360 }
3361 else
3362 {
3363 range++;
3364 startpos--;
3365 }
3366 }
3367 return -1;
3368} /* re_search_2 */
3369
3370
3371/* Declarations and macros for re_match_2. */
3372
3373static int bcmp_translate ();
3374static boolean alt_match_null_string_p (),
3375 common_op_match_null_string_p (),
3376 group_match_null_string_p ();
3377
3378/* This converts PTR, a pointer into one of the search strings `string1'
3379 and `string2' into an offset from the beginning of that string. */
3380#define POINTER_TO_OFFSET(ptr) \
3381 (FIRST_STRING_P (ptr) \
3382 ? ((regoff_t) ((ptr) - string1)) \
3383 : ((regoff_t) ((ptr) - string2 + size1)))
3384
3385/* Macros for dealing with the split strings in re_match_2. */
3386
3387#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3388
3389/* Call before fetching a character with *d. This switches over to
3390 string2 if necessary. */
3391#define PREFETCH() \
3392 while (d == dend) \
3393 { \
3394 /* End of string2 => fail. */ \
3395 if (dend == end_match_2) \
3396 goto fail; \
3397 /* End of string1 => advance to string2. */ \
3398 d = string2; \
3399 dend = end_match_2; \
3400 }
3401
3402
3403/* Test if at very beginning or at very end of the virtual concatenation
3404 of `string1' and `string2'. If only one string, it's `string2'. */
3405#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3406#define AT_STRINGS_END(d) ((d) == end2)
3407
3408
3409/* Test if D points to a character which is word-constituent. We have
3410 two special cases to check for: if past the end of string1, look at
3411 the first character in string2; and if before the beginning of
3412 string2, look at the last character in string1. */
3413#define WORDCHAR_P(d) \
3414 (SYNTAX ((d) == end1 ? *string2 \
3415 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3416 == Sword)
3417
3418/* Test if the character before D and the one at D differ with respect
3419 to being word-constituent. */
3420#define AT_WORD_BOUNDARY(d) \
3421 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3422 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3423
3424
3425/* Free everything we malloc. */
3426#ifdef MATCH_MAY_ALLOCATE
3427#define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
3428#define FREE_VARIABLES() \
3429 do { \
3430 REGEX_FREE_STACK (fail_stack.stack); \
3431 FREE_VAR (regstart); \
3432 FREE_VAR (regend); \
3433 FREE_VAR (old_regstart); \
3434 FREE_VAR (old_regend); \
3435 FREE_VAR (best_regstart); \
3436 FREE_VAR (best_regend); \
3437 FREE_VAR (reg_info); \
3438 FREE_VAR (reg_dummy); \
3439 FREE_VAR (reg_info_dummy); \
3440 } while (0)
3441#else
3442#define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
3443#endif /* not MATCH_MAY_ALLOCATE */
3444
3445/* These values must meet several constraints. They must not be valid
3446 register values; since we have a limit of 255 registers (because
3447 we use only one byte in the pattern for the register number), we can
3448 use numbers larger than 255. They must differ by 1, because of
3449 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3450 be larger than the value for the highest register, so we do not try
3451 to actually save any registers when none are active. */
3452#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3453#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3454
3455
3456/* Matching routines. */
3457
3458#ifndef emacs /* Emacs never uses this. */
3459/* re_match is like re_match_2 except it takes only a single string. */
3460
3461int
3462re_match (bufp, string, size, pos, regs)
3463 struct re_pattern_buffer *bufp;
3464 const char *string;
3465 int size, pos;
3466 struct re_registers *regs;
3467{
3468 int result = re_match_2_internal (bufp, NULL, 0, string, size,
3469 pos, regs, size);
3470 alloca (0);
3471 return result;
3472}
3473#endif /* not emacs */
3474
3475
3476/* re_match_2 matches the compiled pattern in BUFP against the
3477 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3478 and SIZE2, respectively). We start matching at POS, and stop
3479 matching at STOP.
3480
3481 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3482 store offsets for the substring each group matched in REGS. See the
3483 documentation for exactly how many groups we fill.
3484
3485 We return -1 if no match, -2 if an internal error (such as the
3486 failure stack overflowing). Otherwise, we return the length of the
3487 matched substring. */
3488
3489int
3490re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3491 struct re_pattern_buffer *bufp;
3492 const char *string1, *string2;
3493 int size1, size2;
3494 int pos;
3495 struct re_registers *regs;
3496 int stop;
3497{
3498 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
3499 pos, regs, stop);
3500 alloca (0);
3501 return result;
3502}
3503
3504/* This is a separate function so that we can force an alloca cleanup
3505 afterwards. */
3506static int
3507re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
3508 struct re_pattern_buffer *bufp;
3509 const char *string1, *string2;
3510 int size1, size2;
3511 int pos;
3512 struct re_registers *regs;
3513 int stop;
3514{
3515 /* General temporaries. */
3516 int mcnt;
3517 unsigned char *p1;
3518
3519 /* Just past the end of the corresponding string. */
3520 const char *end1, *end2;
3521
3522 /* Pointers into string1 and string2, just past the last characters in
3523 each to consider matching. */
3524 const char *end_match_1, *end_match_2;
3525
3526 /* Where we are in the data, and the end of the current string. */
3527 const char *d, *dend;
3528
3529 /* Where we are in the pattern, and the end of the pattern. */
3530 unsigned char *p = bufp->buffer;
3531 register unsigned char *pend = p + bufp->used;
3532
3533 /* Mark the opcode just after a start_memory, so we can test for an
3534 empty subpattern when we get to the stop_memory. */
3535 unsigned char *just_past_start_mem = 0;
3536
3537 /* We use this to map every character in the string. */
3538 char *translate = bufp->translate;
3539
3540 /* Failure point stack. Each place that can handle a failure further
3541 down the line pushes a failure point on this stack. It consists of
3542 restart, regend, and reg_info for all registers corresponding to
3543 the subexpressions we're currently inside, plus the number of such
3544 registers, and, finally, two char *'s. The first char * is where
3545 to resume scanning the pattern; the second one is where to resume
3546 scanning the strings. If the latter is zero, the failure point is
3547 a ``dummy''; if a failure happens and the failure point is a dummy,
3548 it gets discarded and the next next one is tried. */
3549#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3550 fail_stack_type fail_stack;
3551#endif
3552#ifdef DEBUG
3553 static unsigned failure_id = 0;
3554 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3555#endif
3556
3557 /* This holds the pointer to the failure stack, when
3558 it is allocated relocatably. */
3559 fail_stack_elt_t *failure_stack_ptr;
3560
3561 /* We fill all the registers internally, independent of what we
3562 return, for use in backreferences. The number here includes
3563 an element for register zero. */
3564 unsigned num_regs = bufp->re_nsub + 1;
3565
3566 /* The currently active registers. */
3567 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3568 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3569
3570 /* Information on the contents of registers. These are pointers into
3571 the input strings; they record just what was matched (on this
3572 attempt) by a subexpression part of the pattern, that is, the
3573 regnum-th regstart pointer points to where in the pattern we began
3574 matching and the regnum-th regend points to right after where we
3575 stopped matching the regnum-th subexpression. (The zeroth register
3576 keeps track of what the whole pattern matches.) */
3577#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3578 const char **regstart, **regend;
3579#endif
3580
3581 /* If a group that's operated upon by a repetition operator fails to
3582 match anything, then the register for its start will need to be
3583 restored because it will have been set to wherever in the string we
3584 are when we last see its open-group operator. Similarly for a
3585 register's end. */
3586#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3587 const char **old_regstart, **old_regend;
3588#endif
3589
3590 /* The is_active field of reg_info helps us keep track of which (possibly
3591 nested) subexpressions we are currently in. The matched_something
3592 field of reg_info[reg_num] helps us tell whether or not we have
3593 matched any of the pattern so far this time through the reg_num-th
3594 subexpression. These two fields get reset each time through any
3595 loop their register is in. */
3596#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3597 register_info_type *reg_info;
3598#endif
3599
3600 /* The following record the register info as found in the above
3601 variables when we find a match better than any we've seen before.
3602 This happens as we backtrack through the failure points, which in
3603 turn happens only if we have not yet matched the entire string. */
3604 unsigned best_regs_set = false;
3605#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3606 const char **best_regstart, **best_regend;
3607#endif
3608
3609 /* Logically, this is `best_regend[0]'. But we don't want to have to
3610 allocate space for that if we're not allocating space for anything
3611 else (see below). Also, we never need info about register 0 for
3612 any of the other register vectors, and it seems rather a kludge to
3613 treat `best_regend' differently than the rest. So we keep track of
3614 the end of the best match so far in a separate variable. We
3615 initialize this to NULL so that when we backtrack the first time
3616 and need to test it, it's not garbage. */
3617 const char *match_end = NULL;
3618
3619 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
3620 int set_regs_matched_done = 0;
3621
3622 /* Used when we pop values we don't care about. */
3623#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3624 const char **reg_dummy;
3625 register_info_type *reg_info_dummy;
3626#endif
3627
3628#ifdef DEBUG
3629 /* Counts the total number of registers pushed. */
3630 unsigned num_regs_pushed = 0;
3631#endif
3632
3633 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3634
3635 INIT_FAIL_STACK ();
3636
3637#ifdef MATCH_MAY_ALLOCATE
3638 /* Do not bother to initialize all the register variables if there are
3639 no groups in the pattern, as it takes a fair amount of time. If
3640 there are groups, we include space for register 0 (the whole
3641 pattern), even though we never use it, since it simplifies the
3642 array indexing. We should fix this. */
3643 if (bufp->re_nsub)
3644 {
3645 regstart = REGEX_TALLOC (num_regs, const char *);
3646 regend = REGEX_TALLOC (num_regs, const char *);
3647 old_regstart = REGEX_TALLOC (num_regs, const char *);
3648 old_regend = REGEX_TALLOC (num_regs, const char *);
3649 best_regstart = REGEX_TALLOC (num_regs, const char *);
3650 best_regend = REGEX_TALLOC (num_regs, const char *);
3651 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3652 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3653 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3654
3655 if (!(regstart && regend && old_regstart && old_regend && reg_info
3656 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3657 {
3658 FREE_VARIABLES ();
3659 return -2;
3660 }
3661 }
3662 else
3663 {
3664 /* We must initialize all our variables to NULL, so that
3665 `FREE_VARIABLES' doesn't try to free them. */
3666 regstart = regend = old_regstart = old_regend = best_regstart
3667 = best_regend = reg_dummy = NULL;
3668 reg_info = reg_info_dummy = (register_info_type *) NULL;
3669 }
3670#endif /* MATCH_MAY_ALLOCATE */
3671
3672 /* The starting position is bogus. */
3673 if (pos < 0 || pos > size1 + size2)
3674 {
3675 FREE_VARIABLES ();
3676 return -1;
3677 }
3678
3679 /* Initialize subexpression text positions to -1 to mark ones that no
3680 start_memory/stop_memory has been seen for. Also initialize the
3681 register information struct. */
3682 for (mcnt = 1; mcnt < num_regs; mcnt++)
3683 {
3684 regstart[mcnt] = regend[mcnt]
3685 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3686
3687 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3688 IS_ACTIVE (reg_info[mcnt]) = 0;
3689 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3690 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3691 }
3692
3693 /* We move `string1' into `string2' if the latter's empty -- but not if
3694 `string1' is null. */
3695 if (size2 == 0 && string1 != NULL)
3696 {
3697 string2 = string1;
3698 size2 = size1;
3699 string1 = 0;
3700 size1 = 0;
3701 }
3702 end1 = string1 + size1;
3703 end2 = string2 + size2;
3704
3705 /* Compute where to stop matching, within the two strings. */
3706 if (stop <= size1)
3707 {
3708 end_match_1 = string1 + stop;
3709 end_match_2 = string2;
3710 }
3711 else
3712 {
3713 end_match_1 = end1;
3714 end_match_2 = string2 + stop - size1;
3715 }
3716
3717 /* `p' scans through the pattern as `d' scans through the data.
3718 `dend' is the end of the input string that `d' points within. `d'
3719 is advanced into the following input string whenever necessary, but
3720 this happens before fetching; therefore, at the beginning of the
3721 loop, `d' can be pointing at the end of a string, but it cannot
3722 equal `string2'. */
3723 if (size1 > 0 && pos <= size1)
3724 {
3725 d = string1 + pos;
3726 dend = end_match_1;
3727 }
3728 else
3729 {
3730 d = string2 + pos - size1;
3731 dend = end_match_2;
3732 }
3733
3734 DEBUG_PRINT1 ("The compiled pattern is: ");
3735 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3736 DEBUG_PRINT1 ("The string to match is: `");
3737 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3738 DEBUG_PRINT1 ("'\n");
3739
3740 /* This loops over pattern commands. It exits by returning from the
3741 function if the match is complete, or it drops through if the match
3742 fails at this starting point in the input data. */
3743 for (;;)
3744 {
3745 DEBUG_PRINT2 ("\n0x%x: ", p);
3746
3747 if (p == pend)
3748 { /* End of pattern means we might have succeeded. */
3749 DEBUG_PRINT1 ("end of pattern ... ");
3750
3751 /* If we haven't matched the entire string, and we want the
3752 longest match, try backtracking. */
3753 if (d != end_match_2)
3754 {
3755 /* 1 if this match ends in the same string (string1 or string2)
3756 as the best previous match. */
3757 boolean same_str_p = (FIRST_STRING_P (match_end)
3758 == MATCHING_IN_FIRST_STRING);
3759 /* 1 if this match is the best seen so far. */
3760 boolean best_match_p;
3761
3762 /* AIX compiler got confused when this was combined
3763 with the previous declaration. */
3764 if (same_str_p)
3765 best_match_p = d > match_end;
3766 else
3767 best_match_p = !MATCHING_IN_FIRST_STRING;
3768
3769 DEBUG_PRINT1 ("backtracking.\n");
3770
3771 if (!FAIL_STACK_EMPTY ())
3772 { /* More failure points to try. */
3773
3774 /* If exceeds best match so far, save it. */
3775 if (!best_regs_set || best_match_p)
3776 {
3777 best_regs_set = true;
3778 match_end = d;
3779
3780 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3781
3782 for (mcnt = 1; mcnt < num_regs; mcnt++)
3783 {
3784 best_regstart[mcnt] = regstart[mcnt];
3785 best_regend[mcnt] = regend[mcnt];
3786 }
3787 }
3788 goto fail;
3789 }
3790
3791 /* If no failure points, don't restore garbage. And if
3792 last match is real best match, don't restore second
3793 best one. */
3794 else if (best_regs_set && !best_match_p)
3795 {
3796 restore_best_regs:
3797 /* Restore best match. It may happen that `dend ==
3798 end_match_1' while the restored d is in string2.
3799 For example, the pattern `x.*y.*z' against the
3800 strings `x-' and `y-z-', if the two strings are
3801 not consecutive in memory. */
3802 DEBUG_PRINT1 ("Restoring best registers.\n");
3803
3804 d = match_end;
3805 dend = ((d >= string1 && d <= end1)
3806 ? end_match_1 : end_match_2);
3807
3808 for (mcnt = 1; mcnt < num_regs; mcnt++)
3809 {
3810 regstart[mcnt] = best_regstart[mcnt];
3811 regend[mcnt] = best_regend[mcnt];
3812 }
3813 }
3814 } /* d != end_match_2 */
3815
3816 succeed_label:
3817 DEBUG_PRINT1 ("Accepting match.\n");
3818
3819 /* If caller wants register contents data back, do it. */
3820 if (regs && !bufp->no_sub)
3821 {
3822 /* Have the register data arrays been allocated? */
3823 if (bufp->regs_allocated == REGS_UNALLOCATED)
3824 { /* No. So allocate them with malloc. We need one
3825 extra element beyond `num_regs' for the `-1' marker
3826 GNU code uses. */
3827 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3828 regs->start = TALLOC (regs->num_regs, regoff_t);
3829 regs->end = TALLOC (regs->num_regs, regoff_t);
3830 if (regs->start == NULL || regs->end == NULL)
3831 {
3832 FREE_VARIABLES ();
3833 return -2;
3834 }
3835 bufp->regs_allocated = REGS_REALLOCATE;
3836 }
3837 else if (bufp->regs_allocated == REGS_REALLOCATE)
3838 { /* Yes. If we need more elements than were already
3839 allocated, reallocate them. If we need fewer, just
3840 leave it alone. */
3841 if (regs->num_regs < num_regs + 1)
3842 {
3843 regs->num_regs = num_regs + 1;
3844 RETALLOC (regs->start, regs->num_regs, regoff_t);
3845 RETALLOC (regs->end, regs->num_regs, regoff_t);
3846 if (regs->start == NULL || regs->end == NULL)
3847 {
3848 FREE_VARIABLES ();
3849 return -2;
3850 }
3851 }
3852 }
3853 else
3854 {
3855 /* These braces fend off a "empty body in an else-statement"
3856 warning under GCC when assert expands to nothing. */
3857 assert (bufp->regs_allocated == REGS_FIXED);
3858 }
3859
3860 /* Convert the pointer data in `regstart' and `regend' to
3861 indices. Register zero has to be set differently,
3862 since we haven't kept track of any info for it. */
3863 if (regs->num_regs > 0)
3864 {
3865 regs->start[0] = pos;
3866 regs->end[0] = (MATCHING_IN_FIRST_STRING
3867 ? ((regoff_t) (d - string1))
3868 : ((regoff_t) (d - string2 + size1)));
3869 }
3870
3871 /* Go through the first `min (num_regs, regs->num_regs)'
3872 registers, since that is all we initialized. */
3873 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3874 {
3875 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3876 regs->start[mcnt] = regs->end[mcnt] = -1;
3877 else
3878 {
3879 regs->start[mcnt]
3880 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
3881 regs->end[mcnt]
3882 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
3883 }
3884 }
3885
3886 /* If the regs structure we return has more elements than
3887 were in the pattern, set the extra elements to -1. If
3888 we (re)allocated the registers, this is the case,
3889 because we always allocate enough to have at least one
3890 -1 at the end. */
3891 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3892 regs->start[mcnt] = regs->end[mcnt] = -1;
3893 } /* regs && !bufp->no_sub */
3894
3895 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3896 nfailure_points_pushed, nfailure_points_popped,
3897 nfailure_points_pushed - nfailure_points_popped);
3898 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3899
3900 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3901 ? string1
3902 : string2 - size1);
3903
3904 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3905
3906 FREE_VARIABLES ();
3907 return mcnt;
3908 }
3909
3910 /* Otherwise match next pattern command. */
3911 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3912 {
3913 /* Ignore these. Used to ignore the n of succeed_n's which
3914 currently have n == 0. */
3915 case no_op:
3916 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3917 break;
3918
3919 case succeed:
3920 DEBUG_PRINT1 ("EXECUTING succeed.\n");
3921 goto succeed_label;
3922
3923 /* Match the next n pattern characters exactly. The following
3924 byte in the pattern defines n, and the n bytes after that
3925 are the characters to match. */
3926 case exactn:
3927 mcnt = *p++;
3928 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3929
3930 /* This is written out as an if-else so we don't waste time
3931 testing `translate' inside the loop. */
3932 if (translate)
3933 {
3934 do
3935 {
3936 PREFETCH ();
3937 if (translate[(unsigned char) *d++] != (char) *p++)
3938 goto fail;
3939 }
3940 while (--mcnt);
3941 }
3942 else
3943 {
3944 do
3945 {
3946 PREFETCH ();
3947 if (*d++ != (char) *p++) goto fail;
3948 }
3949 while (--mcnt);
3950 }
3951 SET_REGS_MATCHED ();
3952 break;
3953
3954
3955 /* Match any character except possibly a newline or a null. */
3956 case anychar:
3957 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3958
3959 PREFETCH ();
3960
3961 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3962 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3963 goto fail;
3964
3965 SET_REGS_MATCHED ();
3966 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3967 d++;
3968 break;
3969
3970
3971 case charset:
3972 case charset_not:
3973 {
3974 register unsigned char c;
3975 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3976
3977 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3978
3979 PREFETCH ();
3980 c = TRANSLATE (*d); /* The character to match. */
3981
3982 /* Cast to `unsigned' instead of `unsigned char' in case the
3983 bit list is a full 32 bytes long. */
3984 if (c < (unsigned) (*p * BYTEWIDTH)
3985 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3986 not = !not;
3987
3988 p += 1 + *p;
3989
3990 if (!not) goto fail;
3991
3992 SET_REGS_MATCHED ();
3993 d++;
3994 break;
3995 }
3996
3997
3998 /* The beginning of a group is represented by start_memory.
3999 The arguments are the register number in the next byte, and the
4000 number of groups inner to this one in the next. The text
4001 matched within the group is recorded (in the internal
4002 registers data structure) under the register number. */
4003 case start_memory:
4004 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4005
4006 /* Find out if this group can match the empty string. */
4007 p1 = p; /* To send to group_match_null_string_p. */
4008
4009 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4010 REG_MATCH_NULL_STRING_P (reg_info[*p])
4011 = group_match_null_string_p (&p1, pend, reg_info);
4012
4013 /* Save the position in the string where we were the last time
4014 we were at this open-group operator in case the group is
4015 operated upon by a repetition operator, e.g., with `(a*)*b'
4016 against `ab'; then we want to ignore where we are now in
4017 the string in case this attempt to match fails. */
4018 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4019 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4020 : regstart[*p];
4021 DEBUG_PRINT2 (" old_regstart: %d\n",
4022 POINTER_TO_OFFSET (old_regstart[*p]));
4023
4024 regstart[*p] = d;
4025 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4026
4027 IS_ACTIVE (reg_info[*p]) = 1;
4028 MATCHED_SOMETHING (reg_info[*p]) = 0;
4029
4030 /* Clear this whenever we change the register activity status. */
4031 set_regs_matched_done = 0;
4032
4033 /* This is the new highest active register. */
4034 highest_active_reg = *p;
4035
4036 /* If nothing was active before, this is the new lowest active
4037 register. */
4038 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4039 lowest_active_reg = *p;
4040
4041 /* Move past the register number and inner group count. */
4042 p += 2;
4043 just_past_start_mem = p;
4044
4045 break;
4046
4047
4048 /* The stop_memory opcode represents the end of a group. Its
4049 arguments are the same as start_memory's: the register
4050 number, and the number of inner groups. */
4051 case stop_memory:
4052 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4053
4054 /* We need to save the string position the last time we were at
4055 this close-group operator in case the group is operated
4056 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4057 against `aba'; then we want to ignore where we are now in
4058 the string in case this attempt to match fails. */
4059 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4060 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4061 : regend[*p];
4062 DEBUG_PRINT2 (" old_regend: %d\n",
4063 POINTER_TO_OFFSET (old_regend[*p]));
4064
4065 regend[*p] = d;
4066 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4067
4068 /* This register isn't active anymore. */
4069 IS_ACTIVE (reg_info[*p]) = 0;
4070
4071 /* Clear this whenever we change the register activity status. */
4072 set_regs_matched_done = 0;
4073
4074 /* If this was the only register active, nothing is active
4075 anymore. */
4076 if (lowest_active_reg == highest_active_reg)
4077 {
4078 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4079 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4080 }
4081 else
4082 { /* We must scan for the new highest active register, since
4083 it isn't necessarily one less than now: consider
4084 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4085 new highest active register is 1. */
4086 unsigned char r = *p - 1;
4087 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4088 r--;
4089
4090 /* If we end up at register zero, that means that we saved
4091 the registers as the result of an `on_failure_jump', not
4092 a `start_memory', and we jumped to past the innermost
4093 `stop_memory'. For example, in ((.)*) we save
4094 registers 1 and 2 as a result of the *, but when we pop
4095 back to the second ), we are at the stop_memory 1.
4096 Thus, nothing is active. */
4097 if (r == 0)
4098 {
4099 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4100 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4101 }
4102 else
4103 highest_active_reg = r;
4104 }
4105
4106 /* If just failed to match something this time around with a
4107 group that's operated on by a repetition operator, try to
4108 force exit from the ``loop'', and restore the register
4109 information for this group that we had before trying this
4110 last match. */
4111 if ((!MATCHED_SOMETHING (reg_info[*p])
4112 || just_past_start_mem == p - 1)
4113 && (p + 2) < pend)
4114 {
4115 boolean is_a_jump_n = false;
4116
4117 p1 = p + 2;
4118 mcnt = 0;
4119 switch ((re_opcode_t) *p1++)
4120 {
4121 case jump_n:
4122 is_a_jump_n = true;
4123 case pop_failure_jump:
4124 case maybe_pop_jump:
4125 case jump:
4126 case dummy_failure_jump:
4127 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4128 if (is_a_jump_n)
4129 p1 += 2;
4130 break;
4131
4132 default:
4133 /* do nothing */ ;
4134 }
4135 p1 += mcnt;
4136
4137 /* If the next operation is a jump backwards in the pattern
4138 to an on_failure_jump right before the start_memory
4139 corresponding to this stop_memory, exit from the loop
4140 by forcing a failure after pushing on the stack the
4141 on_failure_jump's jump in the pattern, and d. */
4142 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4143 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4144 {
4145 /* If this group ever matched anything, then restore
4146 what its registers were before trying this last
4147 failed match, e.g., with `(a*)*b' against `ab' for
4148 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4149 against `aba' for regend[3].
4150
4151 Also restore the registers for inner groups for,
4152 e.g., `((a*)(b*))*' against `aba' (register 3 would
4153 otherwise get trashed). */
4154
4155 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4156 {
4157 unsigned r;
4158
4159 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4160
4161 /* Restore this and inner groups' (if any) registers. */
4162 for (r = *p; r < *p + *(p + 1); r++)
4163 {
4164 regstart[r] = old_regstart[r];
4165
4166 /* xx why this test? */
4167 if (old_regend[r] >= regstart[r])
4168 regend[r] = old_regend[r];
4169 }
4170 }
4171 p1++;
4172 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4173 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4174
4175 goto fail;
4176 }
4177 }
4178
4179 /* Move past the register number and the inner group count. */
4180 p += 2;
4181 break;
4182
4183
4184 /* \<digit> has been turned into a `duplicate' command which is
4185 followed by the numeric value of <digit> as the register number. */
4186 case duplicate:
4187 {
4188 register const char *d2, *dend2;
4189 int regno = *p++; /* Get which register to match against. */
4190 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4191
4192 /* Can't back reference a group which we've never matched. */
4193 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4194 goto fail;
4195
4196 /* Where in input to try to start matching. */
4197 d2 = regstart[regno];
4198
4199 /* Where to stop matching; if both the place to start and
4200 the place to stop matching are in the same string, then
4201 set to the place to stop, otherwise, for now have to use
4202 the end of the first string. */
4203
4204 dend2 = ((FIRST_STRING_P (regstart[regno])
4205 == FIRST_STRING_P (regend[regno]))
4206 ? regend[regno] : end_match_1);
4207 for (;;)
4208 {
4209 /* If necessary, advance to next segment in register
4210 contents. */
4211 while (d2 == dend2)
4212 {
4213 if (dend2 == end_match_2) break;
4214 if (dend2 == regend[regno]) break;
4215
4216 /* End of string1 => advance to string2. */
4217 d2 = string2;
4218 dend2 = regend[regno];
4219 }
4220 /* At end of register contents => success */
4221 if (d2 == dend2) break;
4222
4223 /* If necessary, advance to next segment in data. */
4224 PREFETCH ();
4225
4226 /* How many characters left in this segment to match. */
4227 mcnt = dend - d;
4228
4229 /* Want how many consecutive characters we can match in
4230 one shot, so, if necessary, adjust the count. */
4231 if (mcnt > dend2 - d2)
4232 mcnt = dend2 - d2;
4233
4234 /* Compare that many; failure if mismatch, else move
4235 past them. */
4236 if (translate
4237 ? bcmp_translate (d, d2, mcnt, translate)
4238 : bcmp (d, d2, mcnt))
4239 goto fail;
4240 d += mcnt, d2 += mcnt;
4241
4242 /* Do this because we've match some characters. */
4243 SET_REGS_MATCHED ();
4244 }
4245 }
4246 break;
4247
4248
4249 /* begline matches the empty string at the beginning of the string
4250 (unless `not_bol' is set in `bufp'), and, if
4251 `newline_anchor' is set, after newlines. */
4252 case begline:
4253 DEBUG_PRINT1 ("EXECUTING begline.\n");
4254
4255 if (AT_STRINGS_BEG (d))
4256 {
4257 if (!bufp->not_bol) break;
4258 }
4259 else if (d[-1] == '\n' && bufp->newline_anchor)
4260 {
4261 break;
4262 }
4263 /* In all other cases, we fail. */
4264 goto fail;
4265
4266
4267 /* endline is the dual of begline. */
4268 case endline:
4269 DEBUG_PRINT1 ("EXECUTING endline.\n");
4270
4271 if (AT_STRINGS_END (d))
4272 {
4273 if (!bufp->not_eol) break;
4274 }
4275
4276 /* We have to ``prefetch'' the next character. */
4277 else if ((d == end1 ? *string2 : *d) == '\n'
4278 && bufp->newline_anchor)
4279 {
4280 break;
4281 }
4282 goto fail;
4283
4284
4285 /* Match at the very beginning of the data. */
4286 case begbuf:
4287 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4288 if (AT_STRINGS_BEG (d))
4289 break;
4290 goto fail;
4291
4292
4293 /* Match at the very end of the data. */
4294 case endbuf:
4295 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4296 if (AT_STRINGS_END (d))
4297 break;
4298 goto fail;
4299
4300
4301 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
4302 pushes NULL as the value for the string on the stack. Then
4303 `pop_failure_point' will keep the current value for the
4304 string, instead of restoring it. To see why, consider
4305 matching `foo\nbar' against `.*\n'. The .* matches the foo;
4306 then the . fails against the \n. But the next thing we want
4307 to do is match the \n against the \n; if we restored the
4308 string value, we would be back at the foo.
4309
4310 Because this is used only in specific cases, we don't need to
4311 check all the things that `on_failure_jump' does, to make
4312 sure the right things get saved on the stack. Hence we don't
4313 share its code. The only reason to push anything on the
4314 stack at all is that otherwise we would have to change
4315 `anychar's code to do something besides goto fail in this
4316 case; that seems worse than this. */
4317 case on_failure_keep_string_jump:
4318 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4319
4320 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4321 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4322
4323 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4324 break;
4325
4326
4327 /* Uses of on_failure_jump:
4328
4329 Each alternative starts with an on_failure_jump that points
4330 to the beginning of the next alternative. Each alternative
4331 except the last ends with a jump that in effect jumps past
4332 the rest of the alternatives. (They really jump to the
4333 ending jump of the following alternative, because tensioning
4334 these jumps is a hassle.)
4335
4336 Repeats start with an on_failure_jump that points past both
4337 the repetition text and either the following jump or
4338 pop_failure_jump back to this on_failure_jump. */
4339 case on_failure_jump:
4340 on_failure:
4341 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4342
4343 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4344 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4345
4346 /* If this on_failure_jump comes right before a group (i.e.,
4347 the original * applied to a group), save the information
4348 for that group and all inner ones, so that if we fail back
4349 to this point, the group's information will be correct.
4350 For example, in \(a*\)*\1, we need the preceding group,
4351 and in \(\(a*\)b*\)\2, we need the inner group. */
4352
4353 /* We can't use `p' to check ahead because we push
4354 a failure point to `p + mcnt' after we do this. */
4355 p1 = p;
4356
4357 /* We need to skip no_op's before we look for the
4358 start_memory in case this on_failure_jump is happening as
4359 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4360 against aba. */
4361 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4362 p1++;
4363
4364 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4365 {
4366 /* We have a new highest active register now. This will
4367 get reset at the start_memory we are about to get to,
4368 but we will have saved all the registers relevant to
4369 this repetition op, as described above. */
4370 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4371 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4372 lowest_active_reg = *(p1 + 1);
4373 }
4374
4375 DEBUG_PRINT1 (":\n");
4376 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4377 break;
4378
4379
4380 /* A smart repeat ends with `maybe_pop_jump'.
4381 We change it to either `pop_failure_jump' or `jump'. */
4382 case maybe_pop_jump:
4383 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4384 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4385 {
4386 register unsigned char *p2 = p;
4387
4388 /* Compare the beginning of the repeat with what in the
4389 pattern follows its end. If we can establish that there
4390 is nothing that they would both match, i.e., that we
4391 would have to backtrack because of (as in, e.g., `a*a')
4392 then we can change to pop_failure_jump, because we'll
4393 never have to backtrack.
4394
4395 This is not true in the case of alternatives: in
4396 `(a|ab)*' we do need to backtrack to the `ab' alternative
4397 (e.g., if the string was `ab'). But instead of trying to
4398 detect that here, the alternative has put on a dummy
4399 failure point which is what we will end up popping. */
4400
4401 /* Skip over open/close-group commands.
4402 If what follows this loop is a ...+ construct,
4403 look at what begins its body, since we will have to
4404 match at least one of that. */
4405 while (1)
4406 {
4407 if (p2 + 2 < pend
4408 && ((re_opcode_t) *p2 == stop_memory
4409 || (re_opcode_t) *p2 == start_memory))
4410 p2 += 3;
4411 else if (p2 + 6 < pend
4412 && (re_opcode_t) *p2 == dummy_failure_jump)
4413 p2 += 6;
4414 else
4415 break;
4416 }
4417
4418 p1 = p + mcnt;
4419 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4420 to the `maybe_finalize_jump' of this case. Examine what
4421 follows. */
4422
4423 /* If we're at the end of the pattern, we can change. */
4424 if (p2 == pend)
4425 {
4426 /* Consider what happens when matching ":\(.*\)"
4427 against ":/". I don't really understand this code
4428 yet. */
4429 p[-3] = (unsigned char) pop_failure_jump;
4430 DEBUG_PRINT1
4431 (" End of pattern: change to `pop_failure_jump'.\n");
4432 }
4433
4434 else if ((re_opcode_t) *p2 == exactn
4435 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4436 {
4437 register unsigned char c
4438 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4439
4440 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4441 {
4442 p[-3] = (unsigned char) pop_failure_jump;
4443 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4444 c, p1[5]);
4445 }
4446
4447 else if ((re_opcode_t) p1[3] == charset
4448 || (re_opcode_t) p1[3] == charset_not)
4449 {
4450 int not = (re_opcode_t) p1[3] == charset_not;
4451
4452 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4453 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4454 not = !not;
4455
4456 /* `not' is equal to 1 if c would match, which means
4457 that we can't change to pop_failure_jump. */
4458 if (!not)
4459 {
4460 p[-3] = (unsigned char) pop_failure_jump;
4461 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4462 }
4463 }
4464 }
4465 else if ((re_opcode_t) *p2 == charset)
4466 {
4467#ifdef DEBUG
4468 register unsigned char c
4469 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4470#endif
4471
4472 if ((re_opcode_t) p1[3] == exactn
4473 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4474 && (p2[1 + p1[4] / BYTEWIDTH]
4475 & (1 << (p1[4] % BYTEWIDTH)))))
4476 {
4477 p[-3] = (unsigned char) pop_failure_jump;
4478 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4479 c, p1[5]);
4480 }
4481
4482 else if ((re_opcode_t) p1[3] == charset_not)
4483 {
4484 int idx;
4485 /* We win if the charset_not inside the loop
4486 lists every character listed in the charset after. */
4487 for (idx = 0; idx < (int) p2[1]; idx++)
4488 if (! (p2[2 + idx] == 0
4489 || (idx < (int) p1[4]
4490 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
4491 break;
4492
4493 if (idx == p2[1])
4494 {
4495 p[-3] = (unsigned char) pop_failure_jump;
4496 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4497 }
4498 }
4499 else if ((re_opcode_t) p1[3] == charset)
4500 {
4501 int idx;
4502 /* We win if the charset inside the loop
4503 has no overlap with the one after the loop. */
4504 for (idx = 0;
4505 idx < (int) p2[1] && idx < (int) p1[4];
4506 idx++)
4507 if ((p2[2 + idx] & p1[5 + idx]) != 0)
4508 break;
4509
4510 if (idx == p2[1] || idx == p1[4])
4511 {
4512 p[-3] = (unsigned char) pop_failure_jump;
4513 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4514 }
4515 }
4516 }
4517 }
4518 p -= 2; /* Point at relative address again. */
4519 if ((re_opcode_t) p[-1] != pop_failure_jump)
4520 {
4521 p[-1] = (unsigned char) jump;
4522 DEBUG_PRINT1 (" Match => jump.\n");
4523 goto unconditional_jump;
4524 }
4525 /* Note fall through. */
4526
4527
4528 /* The end of a simple repeat has a pop_failure_jump back to
4529 its matching on_failure_jump, where the latter will push a
4530 failure point. The pop_failure_jump takes off failure
4531 points put on by this pop_failure_jump's matching
4532 on_failure_jump; we got through the pattern to here from the
4533 matching on_failure_jump, so didn't fail. */
4534 case pop_failure_jump:
4535 {
4536 /* We need to pass separate storage for the lowest and
4537 highest registers, even though we don't care about the
4538 actual values. Otherwise, we will restore only one
4539 register from the stack, since lowest will == highest in
4540 `pop_failure_point'. */
4541 unsigned dummy_low_reg, dummy_high_reg;
4542 unsigned char *pdummy;
4543 const char *sdummy;
4544
4545 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4546 POP_FAILURE_POINT (sdummy, pdummy,
4547 dummy_low_reg, dummy_high_reg,
4548 reg_dummy, reg_dummy, reg_info_dummy);
4549 }
4550 /* Note fall through. */
4551
4552
4553 /* Unconditionally jump (without popping any failure points). */
4554 case jump:
4555 unconditional_jump:
4556 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4557 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4558 p += mcnt; /* Do the jump. */
4559 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4560 break;
4561
4562
4563 /* We need this opcode so we can detect where alternatives end
4564 in `group_match_null_string_p' et al. */
4565 case jump_past_alt:
4566 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4567 goto unconditional_jump;
4568
4569
4570 /* Normally, the on_failure_jump pushes a failure point, which
4571 then gets popped at pop_failure_jump. We will end up at
4572 pop_failure_jump, also, and with a pattern of, say, `a+', we
4573 are skipping over the on_failure_jump, so we have to push
4574 something meaningless for pop_failure_jump to pop. */
4575 case dummy_failure_jump:
4576 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4577 /* It doesn't matter what we push for the string here. What
4578 the code at `fail' tests is the value for the pattern. */
4579 PUSH_FAILURE_POINT (0, 0, -2);
4580 goto unconditional_jump;
4581
4582
4583 /* At the end of an alternative, we need to push a dummy failure
4584 point in case we are followed by a `pop_failure_jump', because
4585 we don't want the failure point for the alternative to be
4586 popped. For example, matching `(a|ab)*' against `aab'
4587 requires that we match the `ab' alternative. */
4588 case push_dummy_failure:
4589 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4590 /* See comments just above at `dummy_failure_jump' about the
4591 two zeroes. */
4592 PUSH_FAILURE_POINT (0, 0, -2);
4593 break;
4594
4595 /* Have to succeed matching what follows at least n times.
4596 After that, handle like `on_failure_jump'. */
4597 case succeed_n:
4598 EXTRACT_NUMBER (mcnt, p + 2);
4599 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4600
4601 assert (mcnt >= 0);
4602 /* Originally, this is how many times we HAVE to succeed. */
4603 if (mcnt > 0)
4604 {
4605 mcnt--;
4606 p += 2;
4607 STORE_NUMBER_AND_INCR (p, mcnt);
4608 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4609 }
4610 else if (mcnt == 0)
4611 {
4612 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4613 p[2] = (unsigned char) no_op;
4614 p[3] = (unsigned char) no_op;
4615 goto on_failure;
4616 }
4617 break;
4618
4619 case jump_n:
4620 EXTRACT_NUMBER (mcnt, p + 2);
4621 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4622
4623 /* Originally, this is how many times we CAN jump. */
4624 if (mcnt)
4625 {
4626 mcnt--;
4627 STORE_NUMBER (p + 2, mcnt);
4628 goto unconditional_jump;
4629 }
4630 /* If don't have to jump any more, skip over the rest of command. */
4631 else
4632 p += 4;
4633 break;
4634
4635 case set_number_at:
4636 {
4637 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4638
4639 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4640 p1 = p + mcnt;
4641 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4642 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4643 STORE_NUMBER (p1, mcnt);
4644 break;
4645 }
4646
4647 case wordbound:
4648 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4649 if (AT_WORD_BOUNDARY (d))
4650 break;
4651 goto fail;
4652
4653 case notwordbound:
4654 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4655 if (AT_WORD_BOUNDARY (d))
4656 goto fail;
4657 break;
4658
4659 case wordbeg:
4660 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4661 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4662 break;
4663 goto fail;
4664
4665 case wordend:
4666 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4667 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4668 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4669 break;
4670 goto fail;
4671
4672#ifdef emacs
4673 case before_dot:
4674 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4675 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4676 goto fail;
4677 break;
4678
4679 case at_dot:
4680 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4681 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4682 goto fail;
4683 break;
4684
4685 case after_dot:
4686 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4687 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4688 goto fail;
4689 break;
4690#if 0 /* not emacs19 */
4691 case at_dot:
4692 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4693 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4694 goto fail;
4695 break;
4696#endif /* not emacs19 */
4697
4698 case syntaxspec:
4699 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4700 mcnt = *p++;
4701 goto matchsyntax;
4702
4703 case wordchar:
4704 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4705 mcnt = (int) Sword;
4706 matchsyntax:
4707 PREFETCH ();
4708 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
4709 d++;
4710 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
4711 goto fail;
4712 SET_REGS_MATCHED ();
4713 break;
4714
4715 case notsyntaxspec:
4716 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4717 mcnt = *p++;
4718 goto matchnotsyntax;
4719
4720 case notwordchar:
4721 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4722 mcnt = (int) Sword;
4723 matchnotsyntax:
4724 PREFETCH ();
4725 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
4726 d++;
4727 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
4728 goto fail;
4729 SET_REGS_MATCHED ();
4730 break;
4731
4732#else /* not emacs */
4733 case wordchar:
4734 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4735 PREFETCH ();
4736 if (!WORDCHAR_P (d))
4737 goto fail;
4738 SET_REGS_MATCHED ();
4739 d++;
4740 break;
4741
4742 case notwordchar:
4743 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4744 PREFETCH ();
4745 if (WORDCHAR_P (d))
4746 goto fail;
4747 SET_REGS_MATCHED ();
4748 d++;
4749 break;
4750#endif /* not emacs */
4751
4752 default:
4753 abort ();
4754 }
4755 continue; /* Successfully executed one pattern command; keep going. */
4756
4757
4758 /* We goto here if a matching operation fails. */
4759 fail:
4760 if (!FAIL_STACK_EMPTY ())
4761 { /* A restart point is known. Restore to that state. */
4762 DEBUG_PRINT1 ("\nFAIL:\n");
4763 POP_FAILURE_POINT (d, p,
4764 lowest_active_reg, highest_active_reg,
4765 regstart, regend, reg_info);
4766
4767 /* If this failure point is a dummy, try the next one. */
4768 if (!p)
4769 goto fail;
4770
4771 /* If we failed to the end of the pattern, don't examine *p. */
4772 assert (p <= pend);
4773 if (p < pend)
4774 {
4775 boolean is_a_jump_n = false;
4776
4777 /* If failed to a backwards jump that's part of a repetition
4778 loop, need to pop this failure point and use the next one. */
4779 switch ((re_opcode_t) *p)
4780 {
4781 case jump_n:
4782 is_a_jump_n = true;
4783 case maybe_pop_jump:
4784 case pop_failure_jump:
4785 case jump:
4786 p1 = p + 1;
4787 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4788 p1 += mcnt;
4789
4790 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4791 || (!is_a_jump_n
4792 && (re_opcode_t) *p1 == on_failure_jump))
4793 goto fail;
4794 break;
4795 default:
4796 /* do nothing */ ;
4797 }
4798 }
4799
4800 if (d >= string1 && d <= end1)
4801 dend = end_match_1;
4802 }
4803 else
4804 break; /* Matching at this starting point really fails. */
4805 } /* for (;;) */
4806
4807 if (best_regs_set)
4808 goto restore_best_regs;
4809
4810 FREE_VARIABLES ();
4811
4812 return -1; /* Failure to match. */
4813} /* re_match_2 */
4814
4815
4816/* Subroutine definitions for re_match_2. */
4817
4818
4819/* We are passed P pointing to a register number after a start_memory.
4820
4821 Return true if the pattern up to the corresponding stop_memory can
4822 match the empty string, and false otherwise.
4823
4824 If we find the matching stop_memory, sets P to point to one past its number.
4825 Otherwise, sets P to an undefined byte less than or equal to END.
4826
4827 We don't handle duplicates properly (yet). */
4828
4829static boolean
4830group_match_null_string_p (p, end, reg_info)
4831 unsigned char **p, *end;
4832 register_info_type *reg_info;
4833{
4834 int mcnt;
4835 /* Point to after the args to the start_memory. */
4836 unsigned char *p1 = *p + 2;
4837
4838 while (p1 < end)
4839 {
4840 /* Skip over opcodes that can match nothing, and return true or
4841 false, as appropriate, when we get to one that can't, or to the
4842 matching stop_memory. */
4843
4844 switch ((re_opcode_t) *p1)
4845 {
4846 /* Could be either a loop or a series of alternatives. */
4847 case on_failure_jump:
4848 p1++;
4849 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4850
4851 /* If the next operation is not a jump backwards in the
4852 pattern. */
4853
4854 if (mcnt >= 0)
4855 {
4856 /* Go through the on_failure_jumps of the alternatives,
4857 seeing if any of the alternatives cannot match nothing.
4858 The last alternative starts with only a jump,
4859 whereas the rest start with on_failure_jump and end
4860 with a jump, e.g., here is the pattern for `a|b|c':
4861
4862 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4863 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4864 /exactn/1/c
4865
4866 So, we have to first go through the first (n-1)
4867 alternatives and then deal with the last one separately. */
4868
4869
4870 /* Deal with the first (n-1) alternatives, which start
4871 with an on_failure_jump (see above) that jumps to right
4872 past a jump_past_alt. */
4873
4874 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4875 {
4876 /* `mcnt' holds how many bytes long the alternative
4877 is, including the ending `jump_past_alt' and
4878 its number. */
4879
4880 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4881 reg_info))
4882 return false;
4883
4884 /* Move to right after this alternative, including the
4885 jump_past_alt. */
4886 p1 += mcnt;
4887
4888 /* Break if it's the beginning of an n-th alternative
4889 that doesn't begin with an on_failure_jump. */
4890 if ((re_opcode_t) *p1 != on_failure_jump)
4891 break;
4892
4893 /* Still have to check that it's not an n-th
4894 alternative that starts with an on_failure_jump. */
4895 p1++;
4896 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4897 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4898 {
4899 /* Get to the beginning of the n-th alternative. */
4900 p1 -= 3;
4901 break;
4902 }
4903 }
4904
4905 /* Deal with the last alternative: go back and get number
4906 of the `jump_past_alt' just before it. `mcnt' contains
4907 the length of the alternative. */
4908 EXTRACT_NUMBER (mcnt, p1 - 2);
4909
4910 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4911 return false;
4912
4913 p1 += mcnt; /* Get past the n-th alternative. */
4914 } /* if mcnt > 0 */
4915 break;
4916
4917
4918 case stop_memory:
4919 assert (p1[1] == **p);
4920 *p = p1 + 2;
4921 return true;
4922
4923
4924 default:
4925 if (!common_op_match_null_string_p (&p1, end, reg_info))
4926 return false;
4927 }
4928 } /* while p1 < end */
4929
4930 return false;
4931} /* group_match_null_string_p */
4932
4933
4934/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4935 It expects P to be the first byte of a single alternative and END one
4936 byte past the last. The alternative can contain groups. */
4937
4938static boolean
4939alt_match_null_string_p (p, end, reg_info)
4940 unsigned char *p, *end;
4941 register_info_type *reg_info;
4942{
4943 int mcnt;
4944 unsigned char *p1 = p;
4945
4946 while (p1 < end)
4947 {
4948 /* Skip over opcodes that can match nothing, and break when we get
4949 to one that can't. */
4950
4951 switch ((re_opcode_t) *p1)
4952 {
4953 /* It's a loop. */
4954 case on_failure_jump:
4955 p1++;
4956 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4957 p1 += mcnt;
4958 break;
4959
4960 default:
4961 if (!common_op_match_null_string_p (&p1, end, reg_info))
4962 return false;
4963 }
4964 } /* while p1 < end */
4965
4966 return true;
4967} /* alt_match_null_string_p */
4968
4969
4970/* Deals with the ops common to group_match_null_string_p and
4971 alt_match_null_string_p.
4972
4973 Sets P to one after the op and its arguments, if any. */
4974
4975static boolean
4976common_op_match_null_string_p (p, end, reg_info)
4977 unsigned char **p, *end;
4978 register_info_type *reg_info;
4979{
4980 int mcnt;
4981 boolean ret;
4982 int reg_no;
4983 unsigned char *p1 = *p;
4984
4985 switch ((re_opcode_t) *p1++)
4986 {
4987 case no_op:
4988 case begline:
4989 case endline:
4990 case begbuf:
4991 case endbuf:
4992 case wordbeg:
4993 case wordend:
4994 case wordbound:
4995 case notwordbound:
4996#ifdef emacs
4997 case before_dot:
4998 case at_dot:
4999 case after_dot:
5000#endif
5001 break;
5002
5003 case start_memory:
5004 reg_no = *p1;
5005 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5006 ret = group_match_null_string_p (&p1, end, reg_info);
5007
5008 /* Have to set this here in case we're checking a group which
5009 contains a group and a back reference to it. */
5010
5011 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5012 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5013
5014 if (!ret)
5015 return false;
5016 break;
5017
5018 /* If this is an optimized succeed_n for zero times, make the jump. */
5019 case jump:
5020 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5021 if (mcnt >= 0)
5022 p1 += mcnt;
5023 else
5024 return false;
5025 break;
5026
5027 case succeed_n:
5028 /* Get to the number of times to succeed. */
5029 p1 += 2;
5030 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5031
5032 if (mcnt == 0)
5033 {
5034 p1 -= 4;
5035 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5036 p1 += mcnt;
5037 }
5038 else
5039 return false;
5040 break;
5041
5042 case duplicate:
5043 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5044 return false;
5045 break;
5046
5047 case set_number_at:
5048 p1 += 4;
5049
5050 default:
5051 /* All other opcodes mean we cannot match the empty string. */
5052 return false;
5053 }
5054
5055 *p = p1;
5056 return true;
5057} /* common_op_match_null_string_p */
5058
5059
5060/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5061 bytes; nonzero otherwise. */
5062
5063static int
5064bcmp_translate (s1, s2, len, translate)
5065 unsigned char *s1, *s2;
5066 register int len;
5067 char *translate;
5068{
5069 register unsigned char *p1 = s1, *p2 = s2;
5070 while (len)
5071 {
5072 if (translate[*p1++] != translate[*p2++]) return 1;
5073 len--;
5074 }
5075 return 0;
5076}
5077
5078
5079/* Entry points for GNU code. */
5080
5081/* re_compile_pattern is the GNU regular expression compiler: it
5082 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5083 Returns 0 if the pattern was valid, otherwise an error string.
5084
5085 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5086 are set in BUFP on entry.
5087
5088 We call regex_compile to do the actual compilation. */
5089
5090const char *
5091re_compile_pattern (pattern, length, bufp)
5092 const char *pattern;
5093 int length;
5094 struct re_pattern_buffer *bufp;
5095{
5096 reg_errcode_t ret;
5097
5098 /* GNU code is written to assume at least RE_NREGS registers will be set
5099 (and at least one extra will be -1). */
5100 bufp->regs_allocated = REGS_UNALLOCATED;
5101
5102 /* And GNU code determines whether or not to get register information
5103 by passing null for the REGS argument to re_match, etc., not by
5104 setting no_sub. */
5105 bufp->no_sub = 0;
5106
5107 /* Match anchors at newline. */
5108 bufp->newline_anchor = 1;
5109
5110 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5111
5112 if (!ret)
5113 return NULL;
5114 return gettext (re_error_msgid[(int) ret]);
5115}
5116
5117
5118/* Entry points compatible with 4.2 BSD regex library. We don't define
5119 them unless specifically requested. */
5120
5121#ifdef _REGEX_RE_COMP
5122
5123/* BSD has one and only one pattern buffer. */
5124static struct re_pattern_buffer re_comp_buf;
5125
5126char *
5127re_comp (s)
5128 const char *s;
5129{
5130 reg_errcode_t ret;
5131
5132 if (!s)
5133 {
5134 if (!re_comp_buf.buffer)
5135 return gettext ("No previous regular expression");
5136 return 0;
5137 }
5138
5139 if (!re_comp_buf.buffer)
5140 {
5141 re_comp_buf.buffer = (unsigned char *) malloc (200);
5142 if (re_comp_buf.buffer == NULL)
5143 return gettext (re_error_msgid[(int) REG_ESPACE]);
5144 re_comp_buf.allocated = 200;
5145
5146 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5147 if (re_comp_buf.fastmap == NULL)
5148 return gettext (re_error_msgid[(int) REG_ESPACE]);
5149 }
5150
5151 /* Since `re_exec' always passes NULL for the `regs' argument, we
5152 don't need to initialize the pattern buffer fields which affect it. */
5153
5154 /* Match anchors at newlines. */
5155 re_comp_buf.newline_anchor = 1;
5156
5157 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5158
5159 if (!ret)
5160 return NULL;
5161
5162 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5163 return (char *) gettext (re_error_msgid[(int) ret]);
5164}
5165
5166
5167int
5168re_exec (s)
5169 const char *s;
5170{
5171 const int len = strlen (s);
5172 return
5173 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5174}
5175#endif /* _REGEX_RE_COMP */
5176
5177
5178/* POSIX.2 functions. Don't define these for Emacs. */
5179
5180#ifndef emacs
5181
5182/* regcomp takes a regular expression as a string and compiles it.
5183
5184 PREG is a regex_t *. We do not expect any fields to be initialized,
5185 since POSIX says we shouldn't. Thus, we set
5186
5187 `buffer' to the compiled pattern;
5188 `used' to the length of the compiled pattern;
5189 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5190 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5191 RE_SYNTAX_POSIX_BASIC;
5192 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5193 `fastmap' and `fastmap_accurate' to zero;
5194 `re_nsub' to the number of subexpressions in PATTERN.
5195
5196 PATTERN is the address of the pattern string.
5197
5198 CFLAGS is a series of bits which affect compilation.
5199
5200 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5201 use POSIX basic syntax.
5202
5203 If REG_NEWLINE is set, then . and [^...] don't match newline.
5204 Also, regexec will try a match beginning after every newline.
5205
5206 If REG_ICASE is set, then we considers upper- and lowercase
5207 versions of letters to be equivalent when matching.
5208
5209 If REG_NOSUB is set, then when PREG is passed to regexec, that
5210 routine will report only success or failure, and nothing about the
5211 registers.
5212
5213 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5214 the return codes and their meanings.) */
5215
5216int
5217regcomp (preg, pattern, cflags)
5218 regex_t *preg;
5219 const char *pattern;
5220 int cflags;
5221{
5222 reg_errcode_t ret;
5223 unsigned syntax
5224 = (cflags & REG_EXTENDED) ?
5225 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5226
5227 /* regex_compile will allocate the space for the compiled pattern. */
5228 preg->buffer = 0;
5229 preg->allocated = 0;
5230 preg->used = 0;
5231
5232 /* Don't bother to use a fastmap when searching. This simplifies the
5233 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5234 characters after newlines into the fastmap. This way, we just try
5235 every character. */
5236 preg->fastmap = 0;
5237
5238 if (cflags & REG_ICASE)
5239 {
5240 unsigned i;
5241
5242 preg->translate = (char *) malloc (CHAR_SET_SIZE);
5243 if (preg->translate == NULL)
5244 return (int) REG_ESPACE;
5245
5246 /* Map uppercase characters to corresponding lowercase ones. */
5247 for (i = 0; i < CHAR_SET_SIZE; i++)
5248 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
5249 }
5250 else
5251 preg->translate = NULL;
5252
5253 /* If REG_NEWLINE is set, newlines are treated differently. */
5254 if (cflags & REG_NEWLINE)
5255 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5256 syntax &= ~RE_DOT_NEWLINE;
5257 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5258 /* It also changes the matching behavior. */
5259 preg->newline_anchor = 1;
5260 }
5261 else
5262 preg->newline_anchor = 0;
5263
5264 preg->no_sub = !!(cflags & REG_NOSUB);
5265
5266 /* POSIX says a null character in the pattern terminates it, so we
5267 can use strlen here in compiling the pattern. */
5268 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5269
5270 /* POSIX doesn't distinguish between an unmatched open-group and an
5271 unmatched close-group: both are REG_EPAREN. */
5272 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5273
5274 return (int) ret;
5275}
5276
5277
5278/* regexec searches for a given pattern, specified by PREG, in the
5279 string STRING.
5280
5281 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5282 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5283 least NMATCH elements, and we set them to the offsets of the
5284 corresponding matched substrings.
5285
5286 EFLAGS specifies `execution flags' which affect matching: if
5287 REG_NOTBOL is set, then ^ does not match at the beginning of the
5288 string; if REG_NOTEOL is set, then $ does not match at the end.
5289
5290 We return 0 if we find a match and REG_NOMATCH if not. */
5291
5292int
5293regexec (preg, string, nmatch, pmatch, eflags)
5294 const regex_t *preg;
5295 const char *string;
5296 size_t nmatch;
5297 regmatch_t pmatch[];
5298 int eflags;
5299{
5300 int ret;
5301 struct re_registers regs;
5302 regex_t private_preg;
5303 int len = strlen (string);
5304 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5305
5306 private_preg = *preg;
5307
5308 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5309 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5310
5311 /* The user has told us exactly how many registers to return
5312 information about, via `nmatch'. We have to pass that on to the
5313 matching routines. */
5314 private_preg.regs_allocated = REGS_FIXED;
5315
5316 if (want_reg_info)
5317 {
5318 regs.num_regs = nmatch;
5319 regs.start = TALLOC (nmatch, regoff_t);
5320 regs.end = TALLOC (nmatch, regoff_t);
5321 if (regs.start == NULL || regs.end == NULL)
5322 return (int) REG_NOMATCH;
5323 }
5324
5325 /* Perform the searching operation. */
5326 ret = re_search (&private_preg, string, len,
5327 /* start: */ 0, /* range: */ len,
5328 want_reg_info ? &regs : (struct re_registers *) 0);
5329
5330 /* Copy the register information to the POSIX structure. */
5331 if (want_reg_info)
5332 {
5333 if (ret >= 0)
5334 {
5335 unsigned r;
5336
5337 for (r = 0; r < nmatch; r++)
5338 {
5339 pmatch[r].rm_so = regs.start[r];
5340 pmatch[r].rm_eo = regs.end[r];
5341 }
5342 }
5343
5344 /* If we needed the temporary register info, free the space now. */
5345 free (regs.start);
5346 free (regs.end);
5347 }
5348
5349 /* We want zero return to mean success, unlike `re_search'. */
5350 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5351}
5352
5353
5354/* Returns a message corresponding to an error code, ERRCODE, returned
5355 from either regcomp or regexec. We don't use PREG here. */
5356
5357size_t
5358regerror (errcode, preg, errbuf, errbuf_size)
5359 int errcode;
5360 const regex_t *preg;
5361 char *errbuf;
5362 size_t errbuf_size;
5363{
5364 const char *msg;
5365 size_t msg_size;
5366
5367 if (errcode < 0
5368 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5369 /* Only error codes returned by the rest of the code should be passed
5370 to this routine. If we are given anything else, or if other regex
5371 code generates an invalid error code, then the program has a bug.
5372 Dump core so we can fix it. */
5373 abort ();
5374
5375 msg = gettext (re_error_msgid[errcode]);
5376
5377 msg_size = strlen (msg) + 1; /* Includes the null. */
5378
5379 if (errbuf_size != 0)
5380 {
5381 if (msg_size > errbuf_size)
5382 {
5383 strncpy (errbuf, msg, errbuf_size - 1);
5384 errbuf[errbuf_size - 1] = 0;
5385 }
5386 else
5387 strcpy (errbuf, msg);
5388 }
5389
5390 return msg_size;
5391}
5392
5393
5394/* Free dynamically allocated space used by PREG. */
5395
5396void
5397regfree (preg)
5398 regex_t *preg;
5399{
5400 if (preg->buffer != NULL)
5401 free (preg->buffer);
5402 preg->buffer = NULL;
5403
5404 preg->allocated = 0;
5405 preg->used = 0;
5406
5407 if (preg->fastmap != NULL)
5408 free (preg->fastmap);
5409 preg->fastmap = NULL;
5410 preg->fastmap_accurate = 0;
5411
5412 if (preg->translate != NULL)
5413 free (preg->translate);
5414 preg->translate = NULL;
5415}
5416
5417#endif /* not emacs */
5418
5419
5420/*
5421Local variables:
5422make-backup-files: t
5423version-control: t
5424trim-versions-without-asking: nil
5425End:
5426*/
Note: See TracBrowser for help on using the repository browser.