From 98dfb2dc8ea8c42483906b8ed5555f16c55ca369 Mon Sep 17 00:00:00 2001 From: Kate F Date: Wed, 23 Oct 2024 11:15:01 +0100 Subject: [PATCH 01/80] Missing documentation for -x --- man/rx.1/rx.1.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/man/rx.1/rx.1.xml b/man/rx.1/rx.1.xml index a9f1ffd18..27d358177 100644 --- a/man/rx.1/rx.1.xml +++ b/man/rx.1/rx.1.xml @@ -44,6 +44,7 @@ -v"> -w"> -X"> + -x"> -h"> ]> @@ -462,6 +463,17 @@ + + &x.opt; + + + Literals are unanchored. + This applies to all literals; for finer control use a regex dialect. + The default is that literals are anchored, + as if written ^abc$ in regex syntax. + + + &h.opt; From 280ca7c30da78b12925a67531dea3be48ed4fc3c Mon Sep 17 00:00:00 2001 From: Kate F Date: Wed, 23 Oct 2024 11:19:45 +0100 Subject: [PATCH 02/80] Too much documentation for -u. --- man/rx.1/rx.1.xml | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/man/rx.1/rx.1.xml b/man/rx.1/rx.1.xml index 27d358177..01956b47a 100644 --- a/man/rx.1/rx.1.xml +++ b/man/rx.1/rx.1.xml @@ -404,29 +404,20 @@ &u.opt; - Allow ambiguities. - This means patterns with different ids may match the same text. - The default is to error for conflicts. + Allow ambiguities between patterns. + This means patterns with different ids may match the same text. - It's possible to have multiple patterns with the same id - (i.e. by being in the same file when using multi-file mode), - and these are not considered a conflict because they key - to the same id. - - - - - &u.opt; - - - Allow ambiguities between regexps, - such that multiple regexps may match the same text. - The default is to error for ambiguities, + The default is to error for ambiguities, requiring all regexps unioned to be non-overlapping. Formally, the requirement is that they are disjoint languages. Erroring for ambiguities applies after multiple regexps are joined, either by union or by concatenation (&s.opt;). 
+ It's possible to have multiple patterns with the same id + (i.e. by being in the same file when using multi-file mode), + and these are not considered a conflict because they key + to the same id. + &u.opt; is implied by &n.opt;. From 7ebc1ebf0745b7fdbaa05197650ed3a66b26adf5 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 7 Nov 2024 17:16:37 -0500 Subject: [PATCH 03/80] Fix memory leak. The edge sets leak when halting with FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED. --- src/libfsm/determinise.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index fc7c68ba4..d5ba396a4 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -237,6 +237,7 @@ fsm_determinise_with_config(struct fsm *nfa, assert(dfa->states[m->dfastate].edges == NULL); dfa->states[m->dfastate].edges = m->edges; + m->edges = NULL; /* transfer ownership */ /* * The current DFA state is an end state if any of its associated NFA @@ -593,6 +594,8 @@ map_free(struct map *map) if (b == NULL) { continue; } + /* free any edge sets that didn't get transferred */ + edge_set_free(map->alloc, b->edges); f_free(map->alloc, b); } From c9a240a62ce54b0b0cbf883612ffc6e199ca7f23 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 8 Nov 2024 16:22:57 -0500 Subject: [PATCH 04/80] Add a regression test for the memory leak. I verified that LeakSanitizer catches the leak during this test when the fix is reverted. This commit adds a test directory, `tests/regressions`, for misc. regression tests that need a .c file and aren't easily testable via fsm, re, etc. inputs. 
--- Makefile | 1 + tests/regressions/Makefile | 20 ++++++ ...regressions_determinise_state_limit_leak.c | 69 +++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 tests/regressions/Makefile create mode 100644 tests/regressions/regressions_determinise_state_limit_leak.c diff --git a/Makefile b/Makefile index 514f80bba..7075e432d 100644 --- a/Makefile +++ b/Makefile @@ -136,6 +136,7 @@ SUBDIR += tests/pcre-repeat SUBDIR += tests/pred SUBDIR += tests/re_literal SUBDIR += tests/re_strings +SUBDIR += tests/regressions SUBDIR += tests/reverse SUBDIR += tests/trim SUBDIR += tests/union diff --git a/tests/regressions/Makefile b/tests/regressions/Makefile new file mode 100644 index 000000000..624353ef6 --- /dev/null +++ b/tests/regressions/Makefile @@ -0,0 +1,20 @@ +.include "../../share/mk/top.mk" + +TEST.tests/regressions != ls -1 tests/regressions/regressions*.c +TEST_SRCDIR.tests/regressions = tests/regressions +TEST_OUTDIR.tests/regressions = ${BUILD}/tests/regressions + +.for n in ${TEST.tests/regressions:T:R:C/^regressions//} +INCDIR.${TEST_SRCDIR.tests/regressions}/regressions${n}.c += src/adt +.endfor + +.for n in ${TEST.tests/regressions:T:R:C/^regressions//} +test:: ${TEST_OUTDIR.tests/regressions}/res${n} +SRC += ${TEST_SRCDIR.tests/regressions}/regressions${n}.c +CFLAGS.${TEST_SRCDIR.tests/regressions}/regressions${n}.c += -UNDEBUG + +${TEST_OUTDIR.tests/regressions}/run${n}: ${TEST_OUTDIR.tests/regressions}/regressions${n}.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a + ${CC} ${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/regressions}/regressions${n}.c} -o ${TEST_OUTDIR.tests/regressions}/run${n} ${TEST_OUTDIR.tests/regressions}/regressions${n}.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a +${TEST_OUTDIR.tests/regressions}/res${n}: ${TEST_OUTDIR.tests/regressions}/run${n} + ( ${TEST_OUTDIR.tests/regressions}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/regressions}/res${n} +.endfor diff --git 
a/tests/regressions/regressions_determinise_state_limit_leak.c b/tests/regressions/regressions_determinise_state_limit_leak.c new file mode 100644 index 000000000..afdfbd9c5 --- /dev/null +++ b/tests/regressions/regressions_determinise_state_limit_leak.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include + +static const char *strings[] = { + [0] = "apple", + [1] = "banana", + [2] = "carrot", + [3] = "durian", + [4] = "eggplant", +}; +#define STRING_COUNT sizeof(strings)/sizeof(strings[0]) + +int main(void) +{ + struct fsm *fsms[STRING_COUNT] = {0}; + + for (size_t i = 0; i < STRING_COUNT; i++) { + fsms[i] = re_comp(RE_PCRE, fsm_sgetc, &strings[i], NULL, 0, NULL); + assert(fsms[i] != NULL); + } + + struct fsm *combined_fsm = fsm_union_array(STRING_COUNT, fsms, NULL); + assert(combined_fsm != NULL); + + size_t state_limit_base = fsm_countstates(combined_fsm); + size_t max_state_limit = state_limit_base + 100; + + bool hit_state_limit = false; + + for (size_t state_limit = state_limit_base; state_limit < max_state_limit; state_limit += 10) { + struct fsm *cp = fsm_clone(combined_fsm); + + const struct fsm_determinise_config det_config = { + .state_limit = state_limit, + }; + + /* Previously this would leak memory when hitting the STATE_LIMIT_REACHED + * early exit, because the edge sets for the DFA being constructed were + * not freed properly. + * + * The first time this should fail immediately because the state limit IS the starting size, + * but later on it should halt in the middle of construction. 
*/ + switch (fsm_determinise_with_config(cp, &det_config)) { + case FSM_DETERMINISE_WITH_CONFIG_OK: + fsm_free(cp); + break; + case FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED: + hit_state_limit = true; + fsm_free(cp); + break; + case FSM_DETERMINISE_WITH_CONFIG_ERRNO: + assert(!"internal error"); + return EXIT_FAILURE; + } + } + + assert(hit_state_limit); + + fsm_free(combined_fsm); + return EXIT_SUCCESS; +} From 3572258a2688ed843ffc4648021d54d252a58779 Mon Sep 17 00:00:00 2001 From: Ricky Hosfelt Date: Tue, 11 Feb 2025 11:57:57 -0500 Subject: [PATCH 05/80] Update CI to use Ubuntu 22.04 for now so we do not get bleeding edge clang and fail builds --- .github/workflows/ci.yml | 59 +++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dfb6182f2..157506a14 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ env: jobs: checkout: name: "Checkout" - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cache checkout @@ -44,7 +44,10 @@ jobs: pcre_suite: name: "Import PCRE suite" - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-22.04 ] needs: [ build ] # for cvtpcre steps: @@ -71,7 +74,7 @@ jobs: id: cache-cvtpcre with: path: ${{ env.cvtpcre }} - key: cvtpcre-bmake-ubuntu-gcc-DEBUG-AUSAN-${{ github.sha }}-${{ env.pcre2 }} + key: cvtpcre-bmake-${{ matrix.os }}-gcc-DEBUG-AUSAN-${{ github.sha }}-${{ env.pcre2 }} - name: Fetch build if: steps.cache-cvtpcre.outputs.cache-hit != 'true' @@ -79,7 +82,7 @@ jobs: id: cache-build with: path: ${{ env.build }} - key: build-bmake-ubuntu-gcc-DEBUG-AUSAN-${{ github.sha }} # arbitary build, just for cvtpcre + key: build-bmake-${{ matrix.os }}-gcc-DEBUG-AUSAN-${{ github.sha }} # arbitrary build, just for cvtpcre - name: Convert PCRE suite if: steps.cache-cvtpcre.outputs.cache-hit != 'true' @@ -130,14 +133,14 @@ jobs: build: name: "Build ${{ 
matrix.san }} ${{ matrix.cc }} ${{ matrix.os }} ${{ matrix.debug }}" - runs-on: ${{ matrix.os }}-latest + runs-on: ${{ matrix.os }} needs: [ checkout ] strategy: fail-fast: true matrix: san: [ NO_SANITIZER, AUSAN, MSAN, EFENCE, FUZZER ] # NO_SANITIZER=1 is a no-op - os: [ ubuntu ] + os: [ ubuntu-22.04 ] cc: [ clang, gcc ] make: [ bmake ] # we test makefiles separately debug: [ DEBUG, RELEASE ] # RELEASE=1 is a no-op @@ -169,7 +172,7 @@ jobs: key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} - name: Dependencies (Ubuntu) - if: matrix.os == 'ubuntu' && steps.cache-build.outputs.cache-hit != 'true' + if: matrix.os == 'ubuntu-22.04' && steps.cache-build.outputs.cache-hit != 'true' run: | uname -a sudo apt-get install bmake electric-fence @@ -216,14 +219,14 @@ jobs: # of the build during CI, even if we don't run that during tests. test_makefiles: name: "Test (Makefiles) ${{ matrix.make }} ${{ matrix.os }} ${{ matrix.debug }}" - runs-on: ${{ matrix.os }}-latest + runs-on: ${{ matrix.os }} needs: [ checkout, build ] strategy: fail-fast: false matrix: san: [ NO_SANITIZER ] # NO_SANITIZER=1 is a no-op - os: [ ubuntu ] + os: [ ubuntu-22.04 ] cc: [ clang ] make: [ bmake, pmake ] debug: [ EXPENSIVE_CHECKS, DEBUG, RELEASE ] # RELEASE=1 is a no-op @@ -258,7 +261,7 @@ jobs: run: find ${{ env.wc }} -type f -name '*.c' | sort -r | head -5 | xargs touch - name: Dependencies (Ubuntu) - if: matrix.os == 'ubuntu' + if: matrix.os == 'ubuntu-22.04' run: | uname -a sudo apt-get install pmake bmake pcregrep @@ -301,14 +304,14 @@ jobs: test_san: name: "Test (Sanitizers) ${{ matrix.san }} ${{ matrix.cc }} ${{ matrix.os }} ${{ matrix.debug }}" - runs-on: ${{ matrix.os }}-latest + runs-on: ${{ matrix.os }} needs: [ build ] strategy: fail-fast: false matrix: san: [ AUSAN, MSAN, EFENCE ] - os: [ ubuntu ] + os: [ ubuntu-22.04 ] cc: [ clang, gcc ] make: [ bmake ] debug: [ DEBUG, RELEASE ] # RELEASE=1 is a no-op @@ -329,7 +332,7 
@@ jobs: key: checkout-${{ github.sha }} - name: Dependencies (Ubuntu) - if: matrix.os == 'ubuntu' + if: matrix.os == 'ubuntu-22.04' run: | uname -a sudo apt-get install bmake pcregrep electric-fence @@ -362,7 +365,7 @@ jobs: test_fuzz: name: "Fuzz (mode ${{ matrix.mode }}) ${{ matrix.cc }} ${{ matrix.os }} ${{ matrix.debug }}" - runs-on: ${{ matrix.os }}-latest + runs-on: ${{ matrix.os }} timeout-minutes: 5 # this should never be reached, it's a safeguard for bugs in the fuzzer itself needs: [ build ] @@ -370,7 +373,7 @@ jobs: fail-fast: false matrix: san: [ FUZZER ] - os: [ ubuntu ] + os: [ ubuntu-22.04 ] cc: [ clang ] make: [ bmake ] debug: [ DEBUG, RELEASE ] # RELEASE=1 is a no-op @@ -388,7 +391,7 @@ jobs: key: checkout-${{ github.sha }} - name: Dependencies (Ubuntu) - if: matrix.os == 'ubuntu' + if: matrix.os == 'ubuntu-22.04' run: | uname -a sudo apt-get install bmake @@ -470,14 +473,14 @@ jobs: test_pcre: name: "Test (PCRE suite) ${{ matrix.lang }} ${{ matrix.san }} ${{ matrix.cc }} ${{ matrix.os }} ${{ matrix.debug }}" - runs-on: ${{ matrix.os }}-latest + runs-on: ${{ matrix.os }} needs: [ pcre_suite ] # and also build, but pcre_suite gives us that strategy: fail-fast: false matrix: san: [ AUSAN, MSAN, EFENCE ] - os: [ ubuntu ] + os: [ ubuntu-22.04 ] cc: [ clang, gcc ] make: [ bmake ] debug: [ DEBUG, RELEASE ] # RELEASE=1 is a no-op @@ -492,7 +495,7 @@ jobs: steps: - name: Dependencies (Ubuntu) - if: matrix.os == 'ubuntu' && matrix.san == 'EFENCE' + if: matrix.os == 'ubuntu-22.04' && matrix.san == 'EFENCE' run: | uname -a sudo apt-get install electric-fence @@ -506,7 +509,7 @@ jobs: ${{ matrix.cc }} --version - name: Dependencies (Ubuntu/Go) - if: matrix.os == 'ubuntu' && (matrix.lang == 'go' || matrix.lang == 'goasm') + if: matrix.os == 'ubuntu-22.04' && (matrix.lang == 'go' || matrix.lang == 'goasm') run: | uname -a sudo apt-get install golang @@ -524,14 +527,14 @@ jobs: id: cache-cvtpcre with: path: ${{ env.cvtpcre }} - key: 
cvtpcre-bmake-ubuntu-gcc-DEBUG-AUSAN-${{ github.sha }}-${{ env.pcre2 }} + key: cvtpcre-bmake-${{ matrix.os }}-gcc-DEBUG-AUSAN-${{ github.sha }}-${{ env.pcre2 }} - name: Run PCRE suite (${{ matrix.lang }}) run: CC=${{ matrix.cc }} ./${{ env.build }}/bin/retest -O1 -l ${{ matrix.lang }} ${{ env.cvtpcre }}/*.tst docs: name: Documentation - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: [ checkout ] env: @@ -582,12 +585,12 @@ jobs: install: name: Install - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: [ build, docs ] env: san: NO_SANITIZER # NO_SANITIZER=1 is a no-op - os: ubuntu + os: ubuntu-22.04 cc: clang make: bmake debug: RELEASE # RELEASE=1 is a no-op @@ -601,7 +604,7 @@ jobs: key: prefix-${{ env.make }}-${{ env.os }}-${{ env.cc }}-${{ env.debug }}-${{ env.san }}-${{ github.sha }} - name: Dependencies (Ubuntu) - if: env.os == 'ubuntu' && steps.cache-prefix.outputs.cache-hit != 'true' + if: env.os == 'ubuntu-22.04' && steps.cache-prefix.outputs.cache-hit != 'true' run: | uname -a sudo apt-get install bmake @@ -643,12 +646,12 @@ jobs: fpm: name: Package ${{ matrix.pkg }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: [ install ] env: san: NO_SANITIZER # NO_SANITIZER=1 is a no-op - os: ubuntu + os: ubuntu-22.04 cc: clang make: bmake debug: RELEASE # RELEASE=1 is a no-op @@ -661,7 +664,7 @@ jobs: steps: - name: Dependencies (Ubuntu) - if: env.os == 'ubuntu' + if: env.os == 'ubuntu-22.04' run: | uname -a sudo gem install --no-document fpm From c89c15ddbb37d119671fd72219d994a89a413c4f Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 17 Jun 2025 12:24:28 -0400 Subject: [PATCH 06/80] retest: Remove `isatty` check and extra logging output. While discussing other issues, kate said she'd rather not have this have different behavior when stdout is / isn't a tty. 
--- src/retest/main.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/retest/main.c b/src/retest/main.c index 53eba216c..08850325b 100644 --- a/src/retest/main.c +++ b/src/retest/main.c @@ -56,7 +56,6 @@ struct match { struct match *next; }; -static int tty_output = 0; static int do_timing = 0; static int do_watchdog = 0; @@ -969,14 +968,6 @@ process_test_file(const char *filename, flagstring(flags, &flagdesc[0]); - if (tty_output) { - char *re = dup_str_esc(regexp, NULL); - printf("[ ] line %d: working on %s regexp /%s/%s ...\r", - linenum, dialect_name, re, flagdesc); - fflush(stdout); - free(re); - } - re_str = regexp; fsm = re_comp(dialect, fsm_sgetc, &re_str, alloc, flags, &err); if (fsm == NULL) { @@ -1230,9 +1221,6 @@ main(int argc, char *argv[]) int optlevel = 1; - /* is output to a tty or not? */ - tty_output = isatty(fileno(stdout)); - /* note these defaults are the opposite than for fsm(1) */ opt.anonymous_states = 1; opt.consolidate_edges = 1; From 825303f382619bba8d431fd45cb53192a43c4286 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 3 Jul 2025 12:54:25 -0400 Subject: [PATCH 07/80] Remove dependence on internal.h in print/ir.h and lx/print/c.c. print/ir.h only depended on it for FSM_SIGMA_COUNT in `struct ir_state_table`, and IR_TABLE isn't actually implemented, so this can be removed for now. lx/print/c.c only needed internal.h because of print/ir.h, and because of several direct accesses to fsm->statecount, which are easily replaced by calls to fsm_countstates. --- src/libfsm/print/ir.h | 6 +----- src/lx/print/c.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/libfsm/print/ir.h b/src/libfsm/print/ir.h index b375ba850..074097da3 100644 --- a/src/libfsm/print/ir.h +++ b/src/libfsm/print/ir.h @@ -91,11 +91,7 @@ struct ir_state { } error; struct { - /* Note: This is allocated separately, to avoid - * making the union significantly larger. 
*/ - struct ir_state_table { - unsigned to[FSM_SIGMA_COUNT]; - } *table; + int not_yet_implemented; } table; } u; }; diff --git a/src/lx/print/c.c b/src/lx/print/c.c index dae4bd937..4f06475ac 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -18,7 +18,6 @@ #include -#include "libfsm/internal.h" /* XXX */ #include "libfsm/print/ir.h" /* XXX */ #include "lx/lx.h" @@ -38,7 +37,7 @@ skip(const struct fsm *fsm, fsm_state_t state) struct ast_mapping *m; assert(fsm != NULL); - assert(state < fsm->statecount); + assert(state < fsm_countstates(fsm)); if (!fsm_isend(fsm, state)) { return 1; @@ -100,7 +99,8 @@ shortest_example(const struct fsm *fsm, const struct ast_token *token, (void) fsm_getstart(fsm, &goal); min = INT_MAX; - for (i = 0; i < fsm->statecount; i++) { + const size_t statecount = fsm_countstates(fsm); + for (i = 0; i < statecount; i++) { const struct ast_mapping *m; int n; @@ -607,10 +607,11 @@ print_stateenum(FILE *f, const struct fsm *fsm) fprintf(f, "\tenum {\n"); fprintf(f, "\t\t"); - for (i = 0; i < fsm->statecount; i++) { + const size_t statecount = fsm_countstates(fsm); + for (i = 0; i < statecount; i++) { fprintf(f, "S%u, ", i); - if (i + 1 < fsm->statecount && (i + 1) % 10 == 0) { + if (i + 1 < statecount && (i + 1) % 10 == 0) { fprintf(f, "\n"); fprintf(f, "\t\t"); } @@ -718,7 +719,8 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, has_skips = 0; - for (i = 0; i < z->fsm->statecount; i++) { + const size_t statecount = fsm_countstates(z->fsm); + for (i = 0; i < statecount; i++) { int r; r = fsm_reachableall(z->fsm, i, skip); @@ -740,7 +742,7 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\n"); fprintf(f, "\t\tswitch (state) {\n"); - for (i = 0; i < z->fsm->statecount; i++) { + for (i = 0; i < statecount; i++) { int r; r = fsm_reachableall(z->fsm, i, skip); @@ -806,7 +808,8 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\tcase NONE: return %sEOF;\n", 
prefix.tok); - for (i = 0; i < z->fsm->statecount; i++) { + const size_t statecount = fsm_countstates(z->fsm); + for (i = 0; i < statecount; i++) { const struct ast_mapping *m; if (!fsm_isend(z->fsm, i)) { From cb42d58f69bfdf5b48a0c186d86ca1c236fa7e67 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 8 Jul 2025 12:03:27 -0400 Subject: [PATCH 08/80] Makefile: Check '*res*' not 'res*' for tests. The old 'res*' glob misses failures in prefixed res files, such as build/tests/lxpos/dyn-fdgetc-getc-res0 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7075e432d..6f6d9a2e3 100644 --- a/Makefile +++ b/Makefile @@ -190,6 +190,6 @@ STAGE_BUILD := ${STAGE_BUILD:Nbin/cvtpcre} .if make(test) .END:: - grep FAIL ${BUILD}/tests/*/res*; [ $$? -ne 0 ] + grep FAIL ${BUILD}/tests/*/*res*; [ $$? -ne 0 ] .endif From 7824e6a223f09a3bb2965b3d45917e7397aa9da1 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 3 Jul 2025 14:29:34 -0400 Subject: [PATCH 09/80] Fix lx by rewriting much of its -l c codegen. Changing the leaf and endleaf callbacks to accept and reject in #485 broke lx, but it went unnoticed for a while. This fixes it. libfsm's normal execution mode evaluates a DFA, character by character, terminating either when the next character isn't a valid edge or the end of input is reached (in which case it checks end state metadata). lx's execution mode is a little different, because it's tokenizing -- instead of reading to the end of input, it should consume as much consecutive input as matches a particular token, then push back the last character read (so it can resume with it as context for the next token), yield the token type, and suspend. lx used to work by breaking abstraction and calling directly into `fsm_print_cfrag` (overriding the leaf behavior to yield token types, and adding an extra 'NONE' state to the generated state machine code), but when the callback interfaces shifted, its internals no longer fit what lx expected.
Now the reject hook is passed the same state metadata as the accept hook, and the reject hook in lx checks whether the end id is associated with a particular AST mapping and token type. This is only implemented for the "c" output format, but similar changes could possibly make others usable without a lot more work. In particular, kate mentioned it'd be good to be able to use "vmc" output instead of "c" moving forward. Most of the code changes happen inside of lx's code generation, but there are a few elsewhere: - The reject hook now has a state_metadata pointer, so update the callers for all the output formats. - libfsm's 'c' output now includes a macro `FSM_ADVANCE_HOOK(C)`, which is called with the next character read in the FSM_IO_STR and FSM_IO_PAIR io modes immediately after advancing. This is used to inform lx's internal bookkeeping about token positions and buffering token names. FSM_IO_GETC doesn't need it, because its getc callback manages the character stream. The macro defaults to a no-op when undefined. - libfsm's 'c' output also includes a flag, `has_consumed_input`, so the code expanded in place from the reject/accept hooks can determine whether the state machine input handler loop has consumed any input. This was previously encoded by the extra NONE state. lx's code generation using this flag is a bit cluttered, because the reject hook doesn't know whether it's expanding for the end states, but it's probably not worth changing the reject hook type signature to add another flag. This results in checks for has_consumed_input in code paths where trivial static analysis would show it to be dead code, and some extra unreachable code at the end of the function.
--- include/fsm/print.h | 1 + src/libfsm/print.c | 3 +- src/libfsm/print.h | 1 + src/libfsm/print/awk.c | 2 +- src/libfsm/print/c.c | 62 +++++- src/libfsm/print/dot.c | 2 +- src/libfsm/print/fsm.c | 2 +- src/libfsm/print/go.c | 2 +- src/libfsm/print/llvm.c | 2 +- src/libfsm/print/rust.c | 2 +- src/libfsm/print/sh.c | 2 +- src/libfsm/print/vmasm.c | 2 +- src/libfsm/print/vmc.c | 2 +- src/libfsm/print/vmdot.c | 2 +- src/libfsm/print/vmops.c | 2 +- src/lx/main.c | 4 + src/lx/print/c.c | 459 ++++++++++++++++++++++----------------- src/lx/print/dump.c | 20 +- 18 files changed, 348 insertions(+), 224 deletions(-) diff --git a/include/fsm/print.h b/include/fsm/print.h index 9f7264e81..6fae2abea 100644 --- a/include/fsm/print.h +++ b/include/fsm/print.h @@ -75,6 +75,7 @@ struct fsm_hooks { void *lang_opaque, void *hook_opaque); int (*reject)(FILE *, const struct fsm_options *opt, + const struct fsm_state_metadata *state_metadata, void *lang_opaque, void *hook_opaque); int (*comment)(FILE *, const struct fsm_options *opt, diff --git a/src/libfsm/print.c b/src/libfsm/print.c index e1a425cc0..bdb092d96 100644 --- a/src/libfsm/print.c +++ b/src/libfsm/print.c @@ -111,6 +111,7 @@ int print_hook_reject(FILE *f, const struct fsm_options *opt, const struct fsm_hooks *hooks, + const struct fsm_state_metadata *state_metadata, int (*default_reject)(FILE *f, const struct fsm_options *opt, void *lang_opaque, void *hook_opaque), void *lang_opaque) @@ -124,7 +125,7 @@ print_hook_reject(FILE *f, } if (hooks->reject != NULL) { - return hooks->reject(f, opt, + return hooks->reject(f, opt, state_metadata, lang_opaque, hooks->hook_opaque); } else if (default_reject != NULL) { return default_reject(f, opt, diff --git a/src/libfsm/print.h b/src/libfsm/print.h index 0a8e2e0c1..bfedf0e62 100644 --- a/src/libfsm/print.h +++ b/src/libfsm/print.h @@ -44,6 +44,7 @@ int print_hook_reject(FILE *f, const struct fsm_options *opt, const struct fsm_hooks *hooks, + const struct fsm_state_metadata 
*state_metadata, int (*default_reject)(FILE *f, const struct fsm_options *opt, void *lang_opaque, void *hook_opaque), void *lang_opaque); diff --git a/src/libfsm/print/awk.c b/src/libfsm/print/awk.c index bd78e8ef7..4d952cbef 100644 --- a/src/libfsm/print/awk.c +++ b/src/libfsm/print/awk.c @@ -154,7 +154,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC:; struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index 22b03963e..96117f21e 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -210,7 +210,7 @@ print_groups(FILE *f, const struct fsm_options *opt, } static int -print_case(FILE *f, const struct ir *ir, +print_case(FILE *f, const struct ir *ir, fsm_state_t state_id, const struct fsm_options *opt, const struct fsm_hooks *hooks, const char *cp, @@ -222,10 +222,16 @@ print_case(FILE *f, const struct ir *ir, assert(f != NULL); assert(cs != NULL); + assert(state_id < ir->n); + const struct fsm_state_metadata state_metadata = { + .end_ids = ir->states[state_id].endids.ids, + .end_id_count = ir->states[state_id].endids.count, + }; + switch (cs->strategy) { case IR_NONE: fprintf(f, "\t\t\t"); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, &state_metadata, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -254,7 +260,7 @@ print_case(FILE *f, const struct ir *ir, print_groups(f, opt, ir_indexof(ir, cs), cs->u.partial.groups, cs->u.partial.n); fprintf(f, "\t\t\tdefault: "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, &state_metadata, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -285,7 +291,7 @@ print_case(FILE *f, const struct ir *ir, print_ranges(f, opt, 
cs->u.error.error.ranges, cs->u.error.error.n); fprintf(f, " "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, &state_metadata, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -398,7 +404,7 @@ print_endstates(FILE *f, /* unexpected EOT */ fprintf(f, "\tdefault: "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -435,7 +441,7 @@ fsm_print_cfrag(FILE *f, const struct ir *ir, } fprintf(f, "\n"); - if (-1 == print_case(f, ir, opt, hooks, cp, &ir->states[i])) { + if (-1 == print_case(f, ir, i, opt, hooks, cp, &ir->states[i])) { return -1; } @@ -478,6 +484,41 @@ fsm_print_c_body(FILE *f, const struct ir *ir, } } + /* This flag indicates whether the any of the input stream was + * consumed before getting EOF and skipping over the state and + * character logic expanded here. + * + * lx needs to track this for proper EOF handling. It previously + * generated the state enum itself, so that it could include an + * additional 'NONE' state. Inside the input loop, the default + * state of NONE would be updated to the start state, but if the + * input loop was skipped it would still be NONE. */ + fprintf(f, "\tint has_consumed_input = 0;\n"); + + /* For FSM_IO_STR and FSM_IO_PAIR, define a macro that will be + * called with the new character every time iteration advances. + * This is used by lx's internal bookkeeping to track token + * positions in the input stream. For FSM_IO_GETC, the generated + * getc function handles this directly. + * + * This defaults to a no-op unless defined. 
*/ + switch (opt->io) { + case FSM_IO_GETC: + break; /* nothing to do */ + + case FSM_IO_STR: + fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); + fprintf(f, "#define FSM_ADVANCE_HOOK(C) /* no-op */ (void)C\n"); + fprintf(f, "#endif\n"); + break; + + case FSM_IO_PAIR: + fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); + fprintf(f, "#define FSM_ADVANCE_HOOK(C) /* no-op */ (void)C\n"); + fprintf(f, "#endif\n"); + break; + } + /* enum of states */ print_stateenum(f, ir->n); fprintf(f, "\n"); @@ -489,14 +530,19 @@ fsm_print_c_body(FILE *f, const struct ir *ir, switch (opt->io) { case FSM_IO_GETC: fprintf(f, "\twhile (c = fsm_getc(getc_opaque), c != EOF) {\n"); + fprintf(f, "\t\thas_consumed_input = 1;\n"); break; case FSM_IO_STR: fprintf(f, "\tfor (p = s; *p != '\\0'; p++) {\n"); + fprintf(f, "\t\thas_consumed_input = 1;\n"); + fprintf(f, "\t\tFSM_ADVANCE_HOOK(%s);\n", cp); break; case FSM_IO_PAIR: fprintf(f, "\tfor (p = b; p != e; p++) {\n"); + fprintf(f, "\t\thas_consumed_input = 1;\n"); + fprintf(f, "\t\tFSM_ADVANCE_HOOK(%s);\n", cp); break; } @@ -507,6 +553,10 @@ fsm_print_c_body(FILE *f, const struct ir *ir, fprintf(f, "\t}\n"); fprintf(f, "\n"); + /* Suppress unused variable warning -- this is mainly for lx. 
*/ + fprintf(f, "\t(void)has_consumed_input;\n"); + fprintf(f, "\n"); + /* end states */ if (-1 == print_endstates(f, opt, hooks, ir)) { return -1; diff --git a/src/libfsm/print/dot.c b/src/libfsm/print/dot.c index c1ac9b875..69cce10aa 100644 --- a/src/libfsm/print/dot.c +++ b/src/libfsm/print/dot.c @@ -226,7 +226,7 @@ print_dotfrag(FILE *f, if (!opt->anonymous_states) { fprintf(f, "\t%sS%-2u [ ", prefix, s); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, &s)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, &s)) { return -1; } diff --git a/src/libfsm/print/fsm.c b/src/libfsm/print/fsm.c index 06c7e3403..027531514 100644 --- a/src/libfsm/print/fsm.c +++ b/src/libfsm/print/fsm.c @@ -184,7 +184,7 @@ print_state(FILE *f, const struct fsm_options *opt, const struct fsm_hooks *hook assert(opt != NULL); if (!fsm_isend(fsm, s)) { - if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL, NULL)) { return -1; } } diff --git a/src/libfsm/print/go.c b/src/libfsm/print/go.c index 3575c5b75..1f6b7a0dc 100644 --- a/src/libfsm/print/go.c +++ b/src/libfsm/print/go.c @@ -185,7 +185,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC: assert(op->ret >= retlist->a); diff --git a/src/libfsm/print/llvm.c b/src/libfsm/print/llvm.c index 14b116555..98e76d46f 100644 --- a/src/libfsm/print/llvm.c +++ b/src/libfsm/print/llvm.c @@ -713,7 +713,7 @@ fsm_print_llvm(FILE *f, fprintf(f, "\n"); } fprintf(f, "\t "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, NULL)) { return -1; } fprintf(f, "\n"); diff --git a/src/libfsm/print/rust.c b/src/libfsm/print/rust.c index 682bc051f..c1fcf5bb4 100644 --- a/src/libfsm/print/rust.c +++ 
b/src/libfsm/print/rust.c @@ -174,7 +174,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC: assert(op->ret >= retlist->a); diff --git a/src/libfsm/print/sh.c b/src/libfsm/print/sh.c index 3c11c2f1f..5322e4ffc 100644 --- a/src/libfsm/print/sh.c +++ b/src/libfsm/print/sh.c @@ -202,7 +202,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC:; const struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/vmasm.c b/src/libfsm/print/vmasm.c index e29eda24d..bf5551420 100644 --- a/src/libfsm/print/vmasm.c +++ b/src/libfsm/print/vmasm.c @@ -44,7 +44,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL, NULL)) { return -1; } break; diff --git a/src/libfsm/print/vmc.c b/src/libfsm/print/vmc.c index 2fef2da65..9ea06f27a 100644 --- a/src/libfsm/print/vmc.c +++ b/src/libfsm/print/vmc.c @@ -175,7 +175,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC:; struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/vmdot.c b/src/libfsm/print/vmdot.c index 512cc893b..57344cddb 100644 --- a/src/libfsm/print/vmdot.c +++ b/src/libfsm/print/vmdot.c @@ -126,7 +126,7 @@ print_end(FILE *f, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, 
hooks, NULL, default_reject, NULL); case VM_END_SUCC:; struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/vmops.c b/src/libfsm/print/vmops.c index 59de464f1..74fc6d350 100644 --- a/src/libfsm/print/vmops.c +++ b/src/libfsm/print/vmops.c @@ -146,7 +146,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, NULL)) { return -1; } break; diff --git a/src/lx/main.c b/src/lx/main.c index ea16ee70d..92098bfd9 100644 --- a/src/lx/main.c +++ b/src/lx/main.c @@ -695,6 +695,10 @@ main(int argc, char *argv[]) opt.comments = 0; } + if (lang == LX_PRINT_C) { + opt.fragment = 1; + } + { if (print_progress) { fprintf(stderr, "-- parsing:"); diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 4f06475ac..18f6d664a 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -18,18 +18,17 @@ #include -#include "libfsm/print/ir.h" /* XXX */ - #include "lx/lx.h" #include "lx/ast.h" #include "lx/print.h" -/* XXX: abstraction */ -int -fsm_print_cfrag(FILE *f, const struct ir *ir, - const struct fsm_options *opt, - const struct fsm_hooks *hooks, - const char *cp); +struct lx_hook_env { + const struct ast *ast; + /* Name of variable for the current character of input in the + * current scope, which depends on the IO options. 
*/ + const char *cur_char_var; +}; + static int skip(const struct fsm *fsm, fsm_state_t state) @@ -137,6 +136,29 @@ shortest_example(const struct fsm *fsm, const struct ast_token *token, return 1; } +static const char * +buf_op_prefix(void) +{ + if (api_tokbuf & API_FIXEDBUF) { + return "fixed"; + } else if (api_tokbuf & API_DYNBUF) { + return "dyn"; + } else { + assert(!"buf is neither fixed nor dyn"); + return NULL; + } +} + +static void +unget_character(FILE *f, bool pop, const char *cur_char_var) +{ + fprintf(f, "%sungetc(lx, %s); ", prefix.api, cur_char_var); + if (pop && (~api_exclude & API_POS)) { + fprintf(f, "%s%spop(lx->buf_opaque); ", + prefix.api, buf_op_prefix()); + } +} + static int accept_c(FILE *f, const struct fsm_options *opt, const struct fsm_state_metadata *state_metadata, @@ -144,6 +166,7 @@ accept_c(FILE *f, const struct fsm_options *opt, { const struct ast *ast; const struct ast_mapping *m; + struct lx_hook_env *env = hook_opaque; assert(f != NULL); assert(opt != NULL); @@ -152,11 +175,20 @@ accept_c(FILE *f, const struct fsm_options *opt, assert(lang_opaque == NULL); assert(hook_opaque != NULL); - ast = hook_opaque; + ast = env->ast; m = ast_getendmappingbyendid(state_metadata->end_ids[0]); - /* XXX: don't need this if complete */ - fprintf(f, "%sungetc(lx, c); ", prefix.api); + /* re-sync before new call into zone */ + switch (opt->io) { + case FSM_IO_GETC: + break; + + case FSM_IO_STR: + case FSM_IO_PAIR: + fprintf(f, "lx->p = p; "); + break; + } + fprintf(f, "return "); if (m->to != NULL) { fprintf(f, "lx->z = z%u, ", zindexof(ast, m->to)); @@ -168,12 +200,12 @@ accept_c(FILE *f, const struct fsm_options *opt, fprintf(f, "lx->z(lx)"); } fprintf(f, ";"); - return 0; } static int reject_c(FILE *f, const struct fsm_options *opt, + const struct fsm_state_metadata *state_metadata, void *lang_opaque, void *hook_opaque) { assert(f != NULL); @@ -182,7 +214,58 @@ reject_c(FILE *f, const struct fsm_options *opt, assert(hook_opaque != NULL); (void) 
lang_opaque; - (void) hook_opaque; + struct lx_hook_env *env = hook_opaque; + + const struct ast_mapping *m = state_metadata != NULL && state_metadata->end_id_count > 0 + ? ast_getendmappingbyendid(state_metadata->end_ids[0]) + : NULL; + + /* If there is an AST mapping associated with this end state, + * then unget the previous character (in most cases), and + * possibly emit its token type and/or new z state. */ + if (m != NULL) { + const bool has_endids = state_metadata && state_metadata->end_id_count > 0; + if (m->token == NULL && m->to == NULL && !has_endids) { + unget_character(f, true, env->cur_char_var); + } else if (m->token == NULL && m->to == NULL && has_endids) { + unget_character(f, true, env->cur_char_var); + } else if (m->token == NULL && m->to != NULL) { + unget_character(f, true, env->cur_char_var); + } else if (m->token != NULL && m->to == NULL) { + assert(has_endids); + unget_character(f, true, env->cur_char_var); + } else if (m->token != NULL && m->to != NULL) { + unget_character(f, true, env->cur_char_var); + } + + /* re-sync before new call into zone */ + switch (opt->io) { + case FSM_IO_GETC: + break; + + case FSM_IO_STR: + case FSM_IO_PAIR: + fprintf(f, "lx->p = p; "); + break; + } + + fprintf(f, "return "); + if (m->to != NULL) { + fprintf(f, "lx->z = z%u, ", zindexof(env->ast, m->to)); + } + if (m->token != NULL) { + fprintf(f, "%s", prefix.tok); + esctok(f, m->token->s); + } else { + fprintf(f, "lx->z(lx)"); + } + fprintf(f, ";"); + return 0; + } else { + fprintf(f, "\n\t\t\t\tif (!has_consumed_input) { return TOK_EOF; }\n"); + fprintf(f, "\t\t\t\t"); + unget_character(f, false, env->cur_char_var); + } /* XXX: don't need this if complete */ switch (opt->io) { @@ -191,16 +274,12 @@ reject_c(FILE *f, const struct fsm_options *opt, break; case FSM_IO_STR: - fprintf(f, "lx->p = NULL; "); - break; - case FSM_IO_PAIR: fprintf(f, "lx->p = NULL; "); break; } fprintf(f, "return %sUNKNOWN;", prefix.tok); - return 0; } @@ -216,20 +295,32 @@ 
print_proto(FILE *f, const struct ast *ast, const struct ast_zone *z) } static void -print_lgetc(FILE *f) +print_lgetc(FILE *f, const struct fsm_options *opt) { if (api_getc & API_FGETC) { if (print_progress) { fprintf(stderr, " fgetc"); } + if (opt->comments) { + fprintf(f, "/* Get a character from fgetc and push it to the buffer */\n"); + } fprintf(f, "int\n"); fprintf(f, "%sfgetc(struct %slx *lx)\n", prefix.api, prefix.lx); fprintf(f, "{\n"); fprintf(f, "\tassert(lx != NULL);\n"); fprintf(f, "\tassert(lx->getc_opaque != NULL);\n"); fprintf(f, "\n"); - fprintf(f, "\treturn fgetc(lx->getc_opaque);\n"); + + fprintf(f, "\tconst int c = fgetc(lx->getc_opaque);\n"); + fprintf(f, "\tif (c == EOF) {\n"); + fprintf(f, "\t\tlx->c = EOF;\n"); + fprintf(f, "\t\treturn EOF;\n"); + fprintf(f, "\t} else {\n"); + + fprintf(f, "\t\treturn c;\n"); + fprintf(f, "\t}\n"); + fprintf(f, "}\n"); fprintf(f, "\n"); } @@ -346,22 +437,61 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(stderr, " io"); } - /* TODO: consider passing char *c, and return int 0/-1 for error */ - fprintf(f, "#if __STDC_VERSION__ >= 199901L\n"); - fprintf(f, "inline\n"); - fprintf(f, "#endif\n"); fprintf(f, "static int\n"); - fprintf(f, "lx_getc(struct %slx *lx)\n", prefix.lx); + fprintf(f, "lx_advance_end(struct lx *lx, int c)\n"); fprintf(f, "{\n"); - fprintf(f, "\tint c;\n"); - fprintf(f, "\n"); + if (~api_exclude & API_POS) { + fprintf(f, "\tlx->end.byte++;\n"); + fprintf(f, "\tlx->end.col++;\n"); - fprintf(f, "\tassert(lx != NULL);\n"); + fprintf(f, "\tif (c == '\\n') {\n"); + fprintf(f, "\t\tlx->end.line++;\n"); + fprintf(f, "\t\tlx->end.saved_col = lx->end.col - 1;\n"); + fprintf(f, "\t\tlx->end.col = 1;\n"); + + if (opt->io == FSM_IO_STR) { /* ignore terminating '\0' */ + fprintf(f, "\t} else if (c == '\\0') { /* don't count terminating '\\0' */\n"); + fprintf(f, "\t\tlx->end.byte--;\n"); + fprintf(f, "\t\tlx->end.col--;\n"); + fprintf(f, "\t}\n"); + } else { + fprintf(f, "\t}\n"); + } + } + + 
if (~api_exclude & API_BUF) { + fprintf(f, "\tif (lx->push != NULL) {\n"); + fprintf(f, "\t\tif (-1 == lx->push(lx->buf_opaque, (char)c)) {\n"); + fprintf(f, "\t\t\treturn 0;\n"); + fprintf(f, "\t\t}\n"); + fprintf(f, "\t}\n"); + } + + fprintf(f, "\treturn 1;\n"); + fprintf(f, "}\n"); + fprintf(f, "\n"); switch (opt->io) { case FSM_IO_GETC: + /* TODO: consider passing char *c, and return int 0/-1 for error */ + if (opt->comments) { + fprintf(f, "/* This wrapper manages one character of lookahead/pushback\n"); + fprintf(f, " * and the line, column, and byte offsets. */\n"); + } + fprintf(f, "#if __STDC_VERSION__ >= 199901L\n"); + fprintf(f, "inline\n"); + fprintf(f, "#endif\n"); + fprintf(f, "static int\n"); + fprintf(f, "lx_getc(struct %slx *lx)\n", prefix.lx); + fprintf(f, "{\n"); + fprintf(f, "\tint c;\n"); + fprintf(f, "\n"); + + fprintf(f, "\tassert(lx != NULL);\n"); + fprintf(f, "\tassert(lx->lgetc != NULL);\n"); fprintf(f, "\n"); + fprintf(f, "\tif (lx->c != EOF) {\n"); fprintf(f, "\t\tc = lx->c, lx->c = EOF;\n"); fprintf(f, "\t} else {\n"); @@ -371,54 +501,39 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(f, "\t\t}\n"); fprintf(f, "\t}\n"); fprintf(f, "\n"); - break; - case FSM_IO_STR: - /* - * For FSM_IO_STR we treat '\0' as the end of input, - * and so there's no need to distinguish it from EOF. - * We return '\0' here to save the assignment. - */ - fprintf(f, "\tassert(lx->p != NULL);\n"); + /* FIXME: This should distinguish between alloc failure + * and EOF, but will require layers of interface changes. 
*/ + fprintf(f, "\tif (!lx_advance_end(lx, c)) { return EOF; }\n"); fprintf(f, "\n"); - fprintf(f, "\tc = *lx->p++;\n"); - fprintf(f, "\n"); - break; - case FSM_IO_PAIR: - fprintf(f, "\tassert(lx->p != NULL);\n"); - fprintf(f, "\n"); - fprintf(f, "\tif (lx->p == lx->e) {\n"); - fprintf(f, "\t\t\treturn EOF;\n"); - fprintf(f, "\t}\n"); - fprintf(f, "\n"); - fprintf(f, "\tc = *lx->p++;\n"); + fprintf(f, "\treturn c;\n"); + fprintf(f, "}\n"); fprintf(f, "\n"); - break; - } - if (~api_exclude & API_POS) { - fprintf(f, "\tlx->end.byte++;\n"); - fprintf(f, "\tlx->end.col++;\n"); + /* Add an implementation of fsm_getc that calls back + * into lx_getc with the lx handle. */ + fprintf(f, "/* This wrapper adapts calling lx_getc to the interface\n"); + fprintf(f, " * in libfsm's generated code. */\n"); + fprintf(f, "static int\n"); + fprintf(f, "fsm_getc(void *getc_opaque)\n"); + fprintf(f, "{\n"); + + fprintf(f, "\treturn lx_getc((struct lx *)getc_opaque);\n"); + fprintf(f, "}\n"); fprintf(f, "\n"); - fprintf(f, "\tif (c == '\\n') {\n"); - fprintf(f, "\t\tlx->end.line++;\n"); - fprintf(f, "\t\tlx->end.saved_col = lx->end.col - 1;\n"); - fprintf(f, "\t\tlx->end.col = 1;\n"); + break; - if (opt->io == FSM_IO_STR) { /* ignore terminating '\0' */ - fprintf(f, "\t} else if (c == '\\0') { /* don't count terminating '\\0' */\n"); - fprintf(f, "\t\tlx->end.byte--;\n"); - fprintf(f, "\t\tlx->end.col--;\n"); - fprintf(f, "\t}\n"); - } else { - fprintf(f, "\t}\n"); - } + case FSM_IO_PAIR: + case FSM_IO_STR: + /* When libfsm's generated code advances a character, update + * lx's token name buffer and position bookkeeping. 
*/ + fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); + fprintf(f, "#define FSM_ADVANCE_HOOK(C) if (!lx_advance_end(lx, C)) { return TOK_ERROR; }\n"); + fprintf(f, "#endif\n"); fprintf(f, "\n"); + break; } - fprintf(f, "\treturn c;\n"); - fprintf(f, "}\n"); - fprintf(f, "\n"); fprintf(f, "#if __STDC_VERSION__ >= 199901L\n"); fprintf(f, "inline\n"); @@ -431,30 +546,19 @@ print_io(FILE *f, const struct fsm_options *opt) switch (opt->io) { case FSM_IO_GETC: fprintf(f, "\tassert(lx->c == EOF);\n"); - fprintf(f, "\n"); fprintf(f, "\tlx->c = c;\n"); - fprintf(f, "\n"); break; case FSM_IO_STR: fprintf(f, "\tassert(lx->p != NULL);\n"); - fprintf(f, "\tassert(*(lx->p - 1) == c);\n"); - fprintf(f, "\n"); - fprintf(f, "\tlx->p--;\n"); - fprintf(f, "\n"); break; case FSM_IO_PAIR: fprintf(f, "\tassert(lx->p != NULL);\n"); - fprintf(f, "\tassert(*(lx->p - 1) == c);\n"); - fprintf(f, "\n"); - fprintf(f, "\tlx->p--;\n"); - fprintf(f, "\n"); break; } if (~api_exclude & API_POS) { - fprintf(f, "\n"); fprintf(f, "\tlx->end.byte--;\n"); fprintf(f, "\tlx->end.col--;\n"); fprintf(f, "\n"); @@ -468,7 +572,7 @@ print_io(FILE *f, const struct fsm_options *opt) } static void -print_buf(FILE *f) +print_buf(FILE *f, const struct fsm_options *opt) { if (api_tokbuf & API_DYNBUF) { if (print_progress) { @@ -518,7 +622,28 @@ print_buf(FILE *f) fprintf(f, "}\n"); fprintf(f, "\n"); + + if (~api_exclude & API_BUF) { + fprintf(f, "static void\n"); + fprintf(f, "%sdynpop(void *buf_opaque)\n", prefix.api); + fprintf(f, "{\n"); + fprintf(f, "\tstruct lx_dynbuf *t = buf_opaque;\n"); + fprintf(f, "\n"); + fprintf(f, "\tassert(t != NULL);\n"); + fprintf(f, "\n"); + + if (opt->io == FSM_IO_GETC) { + fprintf(f, "\tassert(t->p != t->a);\n"); + } + + fprintf(f, "\tt->p--;\n"); + + fprintf(f, "}\n"); + fprintf(f, "\n"); + } + fprintf(f, "int\n"); + /* FIXME: handle error from dynclear */ fprintf(f, "%sdynclear(void *buf_opaque)\n", prefix.api); fprintf(f, "{\n"); fprintf(f, "\tstruct lx_dynbuf *t = buf_opaque;\n"); @@ 
-542,6 +667,7 @@ print_buf(FILE *f) fprintf(f, "\n"); fprintf(f, "\tt->p = t->a;\n"); fprintf(f, "\n"); + fprintf(f, "\treturn 0;\n"); fprintf(f, "}\n"); fprintf(f, "\n"); @@ -582,6 +708,24 @@ print_buf(FILE *f) fprintf(f, "}\n"); fprintf(f, "\n"); + if (~api_exclude & API_BUF) { + fprintf(f, "static void\n"); + fprintf(f, "%sfixedpop(void *buf_opaque)\n", prefix.api); + fprintf(f, "{\n"); + fprintf(f, "\tstruct lx_fixedbuf *t = buf_opaque;\n"); + fprintf(f, "\n"); + fprintf(f, "\tassert(t != NULL);\n"); + fprintf(f, "\tassert(t->p != NULL);\n"); + fprintf(f, "\tassert(t->a != NULL);\n"); + + if (opt->io == FSM_IO_GETC) { + fprintf(f, "\tassert(t->p > t->a);\n"); + } + fprintf(f, "\tt->p--;\n"); + fprintf(f, "}\n"); + fprintf(f, "\n"); + } + fprintf(f, "int\n"); fprintf(f, "%sfixedclear(void *buf_opaque)\n", prefix.api); fprintf(f, "{\n"); @@ -599,50 +743,33 @@ print_buf(FILE *f) } } -static void -print_stateenum(FILE *f, const struct fsm *fsm) -{ - fsm_state_t i; - - fprintf(f, "\tenum {\n"); - fprintf(f, "\t\t"); - - const size_t statecount = fsm_countstates(fsm); - for (i = 0; i < statecount; i++) { - fprintf(f, "S%u, ", i); - - if (i + 1 < statecount && (i + 1) % 10 == 0) { - fprintf(f, "\n"); - fprintf(f, "\t\t"); - } - } - - fprintf(f, "NONE"); - - fprintf(f, "\n"); - fprintf(f, "\t} state;\n"); -} - static int print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, - const struct fsm_options *opt, const char *cp) + const struct fsm_options *opt, const char *cur_char_var) { assert(f != NULL); assert(z != NULL); assert(z->fsm != NULL); assert(fsm_all(z->fsm, fsm_isdfa)); assert(ast != NULL); - assert(cp != NULL); + assert(cur_char_var != NULL); - /* TODO: prerequisite that the FSM is a DFA */ + /* prerequisite that the FSM is a DFA */ + assert(fsm_all(z->fsm, fsm_isdfa)); fprintf(f, "static enum %stoken\n", prefix.api); fprintf(f, "z%u(struct %slx *lx)\n", zindexof(ast, z), prefix.lx); fprintf(f, "{\n"); - fprintf(f, "\tint c;\n"); - fprintf(f, 
"\n"); - print_stateenum(f, z->fsm); + switch (opt->io) { + case FSM_IO_GETC: + fprintf(f, "\tint c;\n"); + break; + case FSM_IO_STR: + case FSM_IO_PAIR: + break; + } + fprintf(f, "\n"); fprintf(f, "\tassert(lx != NULL);\n"); @@ -655,9 +782,6 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\n"); } - fprintf(f, "\tstate = NONE;\n"); - fprintf(f, "\n"); - if (~api_exclude & API_POS) { fprintf(f, "\tlx->start = lx->end;\n"); fprintf(f, "\n"); @@ -665,52 +789,37 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, switch (opt->io) { case FSM_IO_GETC: - fprintf(f, "\twhile (c = lx_getc(lx), c != EOF) {\n"); + fprintf(f, "\tvoid *getc_opaque = (void *)lx;\n"); + + /* This must be called with fragment sent, otherwise + * it will generate a nested function definition. */ + assert(opt->fragment); break; case FSM_IO_STR: - fprintf(f, "\twhile (c = lx_getc(lx), c != '\\0') {\n"); + fprintf(f, "const char *s = lx->p;\n"); + fprintf(f, "const char *p;\n"); break; case FSM_IO_PAIR: - fprintf(f, "\twhile (c = lx_getc(lx), c != EOF) {\n"); + fprintf(f, "\tconst char *p, *b = lx->p, *e = lx->e;\n"); break; } - { - fsm_state_t start; - - if (!fsm_getstart(z->fsm, &start)) { - errno = EINVAL; - return -1; - } - - fprintf(f, "\t\tif (state == NONE) {\n"); - fprintf(f, "\t\t\tstate = S%u;\n", start); - fprintf(f, "\t\t}\n"); - fprintf(f, "\n"); - } - { static const struct fsm_hooks defaults; struct fsm_hooks hooks = defaults; - struct ir *ir; - assert(cp != NULL); + struct lx_hook_env hook_env = { + .ast = ast, + .cur_char_var = cur_char_var, + }; hooks.accept = accept_c; hooks.reject = reject_c; - hooks.hook_opaque = (void *) ast; - - ir = make_ir(z->fsm, opt); - if (ir == NULL) { - /* TODO */ - } - - /* XXX: abstraction */ - (void) fsm_print_cfrag(f, ir, opt, &hooks, cp); + hooks.hook_opaque = &hook_env; - free_ir(z->fsm, ir); + fsm_print(f, z->fsm, opt, &hooks, FSM_PRINT_C); } if (~api_exclude & API_BUF) { @@ -762,7 +871,7 @@ 
print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\t\tdefault:\n"); fprintf(f, "\t\t\tif (lx->push != NULL) {\n"); - fprintf(f, "\t\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cp); + fprintf(f, "\t\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cur_char_var); fprintf(f, "\t\t\t\t\treturn %sERROR;\n", prefix.tok); fprintf(f, "\t\t\t\t}\n"); fprintf(f, "\t\t\t}\n"); @@ -773,20 +882,16 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, } else { fprintf(f, "\n"); fprintf(f, "\t\tif (lx->push != NULL) {\n"); - fprintf(f, "\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cp); + fprintf(f, "\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cur_char_var); fprintf(f, "\t\t\t\treturn %sERROR;\n", prefix.tok); fprintf(f, "\t\t\t}\n"); fprintf(f, "\t\t}\n"); } } - fprintf(f, "\t}\n"); - fprintf(f, "\n"); { - fsm_state_t i; - switch (opt->io) { case FSM_IO_GETC: fprintf(f, "\tlx->lgetc = NULL;\n"); @@ -804,42 +909,10 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, break; } - fprintf(f, "\tswitch (state) {\n"); - - fprintf(f, "\tcase NONE: return %sEOF;\n", prefix.tok); - - const size_t statecount = fsm_countstates(z->fsm); - for (i = 0; i < statecount; i++) { - const struct ast_mapping *m; - - if (!fsm_isend(z->fsm, i)) { - continue; - } - - m = ast_getendmapping(z->fsm, i); - if (LOG()) { - fprintf(stderr, "print_zone: ast_getendmapping for state %d: %p (c)\n", - i, (void *)m); - } - assert(m != NULL); - - fprintf(f, "\tcase S%u: return ", (unsigned) i); - - /* note: no point in changing zone here, because getc is now NULL */ - - if (m->token == NULL) { - fprintf(f, "%sEOF;\n", prefix.tok); - } else { - /* TODO: maybe make a printf-like little language to simplify this */ - fprintf(f, "%s", prefix.tok); - esctok(f, m->token->s); - fprintf(f, ";\n"); - } - } - - fprintf(f, "\tdefault: errno = EINVAL; return %sERROR;\n", prefix.tok); - - fprintf(f, "\t}\n"); + 
fprintf(f, "\tif (!has_consumed_input) {\n"); + fprintf(f, "\t\treturn TOK_EOF;\n"); + fprintf(f, "\t} \n"); + fprintf(f, "\n"); } fprintf(f, "}\n\n"); @@ -970,8 +1043,8 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) switch (opt->io) { case FSM_IO_GETC: cp = "c"; break; - case FSM_IO_STR: cp = "c"; break; - case FSM_IO_PAIR: cp = "c"; break; + case FSM_IO_STR: cp = "*p"; break; + case FSM_IO_PAIR: cp = "*p"; break; } for (z = ast->zl; z != NULL; z = z->next) { @@ -1009,9 +1082,9 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\n"); print_io(f, opt); - print_lgetc(f); + print_lgetc(f, opt); - print_buf(f); + print_buf(f, opt); if (print_progress) { zn = 0; @@ -1044,6 +1117,7 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "void\n"); fprintf(f, "%sinit(struct %slx *lx)\n", prefix.api, prefix.lx); fprintf(f, "{\n"); + fprintf(f, "\tstatic const struct %slx lx_default;\n", prefix.lx); fprintf(f, "\n"); fprintf(f, "\tassert(lx != NULL);\n"); @@ -1051,16 +1125,8 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t*lx = lx_default;\n"); fprintf(f, "\n"); - switch (opt->io) { - case FSM_IO_GETC: + if (opt->io == FSM_IO_GETC) { fprintf(f, "\tlx->c = EOF;\n"); - break; - - case FSM_IO_STR: - break; - - case FSM_IO_PAIR: - break; } fprintf(f, "\tlx->z = z%u;\n", zindexof(ast, ast->global)); @@ -1091,9 +1157,6 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) break; case FSM_IO_STR: - fprintf(f, "\tif (lx->p == NULL) {\n"); - break; - case FSM_IO_PAIR: fprintf(f, "\tif (lx->p == NULL) {\n"); break; diff --git a/src/lx/print/dump.c b/src/lx/print/dump.c index 162630f88..232b2eb31 100644 --- a/src/lx/print/dump.c +++ b/src/lx/print/dump.c @@ -201,10 +201,12 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tbuf.len = 0;\n"); fprintf(f, "\n"); - fprintf(f, 
"\tlx.buf_opaque = &buf;\n"); - fprintf(f, "\tlx.push = lx_dynpush;\n"); - fprintf(f, "\tlx.clear = lx_dynclear;\n"); - fprintf(f, "\tlx.free = lx_dynfree;\n"); + if (~api_exclude & API_BUF) { + fprintf(f, "\tlx.buf_opaque = &buf;\n"); + fprintf(f, "\tlx.push = lx_dynpush;\n"); + fprintf(f, "\tlx.clear = lx_dynclear;\n"); + fprintf(f, "\tlx.free = lx_dynfree;\n"); + } fprintf(f, "\n"); break; @@ -214,10 +216,12 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tbuf.len = sizeof a;\n"); /* XXX: rename .len to .size */ fprintf(f, "\n"); - fprintf(f, "\tlx.buf_opaque = &buf;\n"); - fprintf(f, "\tlx.push = lx_fixedpush;\n"); - fprintf(f, "\tlx.clear = lx_fixedclear;\n"); - fprintf(f, "\tlx.free = NULL;\n"); + if (~api_exclude & API_BUF) { + fprintf(f, "\tlx.buf_opaque = &buf;\n"); + fprintf(f, "\tlx.push = lx_fixedpush;\n"); + fprintf(f, "\tlx.clear = lx_fixedclear;\n"); + fprintf(f, "\tlx.free = NULL;\n"); + } fprintf(f, "\n"); break; } From 26e2b139468000d151852bc88e1c28f5af249548 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 22 Jul 2025 10:08:07 -0400 Subject: [PATCH 10/80] test/lxpos: Update test data (EOF position info). Instead of having the EOF token occupy the same byte, line, and column position as the last token, it should immediately follow. The new lx codegen behaves this way, and katef and I decided that it made sense to keep it like that, as long as it's consistent. 
--- tests/lxpos/out0.dump | 2 +- tests/lxpos/out1.dump | 2 +- tests/lxpos/out2.dump | 2 +- tests/lxpos/out3.dump | 2 +- tests/lxpos/out4.dump | 2 +- tests/lxpos/out5.dump | 2 +- tests/lxpos/out6.dump | 2 +- tests/lxpos/out7.dump | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/lxpos/out0.dump b/tests/lxpos/out0.dump index 347bb87dc..8f1a4ac1d 100644 --- a/tests/lxpos/out0.dump +++ b/tests/lxpos/out0.dump @@ -2,4 +2,4 @@ 6-12:1,7-13: 12-13:1-2,13-1: -12-13:1-2,13-1: +13:2,1: diff --git a/tests/lxpos/out1.dump b/tests/lxpos/out1.dump index bf4d9bcb9..f3432e1e3 100644 --- a/tests/lxpos/out1.dump +++ b/tests/lxpos/out1.dump @@ -6,4 +6,4 @@ 19-25:2,7-13: 25-26:2-3,13-1: -25-26:2-3,13-1: +26:3,1: diff --git a/tests/lxpos/out2.dump b/tests/lxpos/out2.dump index cf5aa7013..bdd9e90a6 100644 --- a/tests/lxpos/out2.dump +++ b/tests/lxpos/out2.dump @@ -9,4 +9,4 @@ 12-13:1,13-14: 13-14:1-2,14-1: -13-14:1-2,14-1: +14:2,1: diff --git a/tests/lxpos/out3.dump b/tests/lxpos/out3.dump index 92d9e915a..e7f0f1e70 100644 --- a/tests/lxpos/out3.dump +++ b/tests/lxpos/out3.dump @@ -5,4 +5,4 @@ 7-8:1,8-9: 35-36:1-2,36-1: -35-36:1-2,36-1: +36:2,1: diff --git a/tests/lxpos/out4.dump b/tests/lxpos/out4.dump index 6161924b0..c3bb9bb2d 100644 --- a/tests/lxpos/out4.dump +++ b/tests/lxpos/out4.dump @@ -8,4 +8,4 @@ 27-28:1-2,28-1: 28-29:2,1-2: -28-29:2,1-2: +29:2,2: diff --git a/tests/lxpos/out5.dump b/tests/lxpos/out5.dump index e2506a1df..040b6734a 100644 --- a/tests/lxpos/out5.dump +++ b/tests/lxpos/out5.dump @@ -7,4 +7,4 @@ 4-5:3,1-2: 5-6:3-4,2-1: -5-6:3-4,2-1: +6:4,1: diff --git a/tests/lxpos/out6.dump b/tests/lxpos/out6.dump index 8036b2ad4..395d995be 100644 --- a/tests/lxpos/out6.dump +++ b/tests/lxpos/out6.dump @@ -1,2 +1,2 @@ 0-1:1,1-2: -4-5:1,5-6: +5:1,6: diff --git a/tests/lxpos/out7.dump b/tests/lxpos/out7.dump index 7eeab8bcb..0706b7981 100644 --- a/tests/lxpos/out7.dump +++ b/tests/lxpos/out7.dump @@ -2,4 +2,4 @@ 17-18:2,12-13: 18-19:2-3,13-1: 
-18-19:2-3,13-1: +19:3,1: From 862a68cb41726de9b3f83e691d77ba5677f8dd08 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 22 Jul 2025 16:19:43 -0400 Subject: [PATCH 11/80] Re-enable lxpos tests. Add `${LX}` as a dependency for the targets using it. Remove the `getcio=${io}` and `io=${io}` arguments to cat. Those may be a merge error? They just produce a warning. --- Makefile | 3 +-- tests/lxpos/Makefile | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6f6d9a2e3..b9e196d7b 100644 --- a/Makefile +++ b/Makefile @@ -124,8 +124,7 @@ SUBDIR += tests/fsm SUBDIR += tests/glob SUBDIR += tests/like SUBDIR += tests/literal -# FIXME: commenting this out for now due to Makefile error -#SUBDIR += tests/lxpos +SUBDIR += tests/lxpos SUBDIR += tests/minimise SUBDIR += tests/native SUBDIR += tests/pcre diff --git a/tests/lxpos/Makefile b/tests/lxpos/Makefile index da2263d24..9f6d464c4 100644 --- a/tests/lxpos/Makefile +++ b/tests/lxpos/Makefile @@ -30,13 +30,13 @@ CFLAGS.${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump += -I ${BUILD}/tests/lxpos .for ext in c h -${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.${ext}: tests/lxpos/lexer.lx +${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.${ext}: tests/lxpos/lexer.lx ${LX} ${LX} -l ${ext} ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ || { rm -f $@; false; } .endfor -${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump.c: tests/lxpos/lexer.lx +${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump.c: tests/lxpos/lexer.lx ${LX} ${LX} -l dump ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ || { rm -f $@; false; } @@ -53,11 +53,11 @@ CLEAN += ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump ${TEST_OUTDIR.tests/lxpos}/${buf}-${getc}-${io}-got${n}.dump: ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump ${TEST_SRCDIR.tests/lxpos}/in${n}.txt .if ${getc} != none - cat ${.ALLSRC:M*.txt} getcio=${io} \ + cat 
${.ALLSRC:M*.txt} \ | ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump \ > $@ .else - cat ${.ALLSRC:M*.txt} io=${io} \ + cat ${.ALLSRC:M*.txt} \ | xargs -0 ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump \ > $@ .endif From affca78d8a5164ea3dea91c702d777c1a8951223 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 23 Jul 2025 11:13:37 -0400 Subject: [PATCH 12/80] Use $LX_BIN instead of $LX in lxpos makefile. Some of the CI test matrix builds set LX to 'true; echo lx', but that obviously won't work for tests that actually need to run lx in order to exercise its output. --- tests/lxpos/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/lxpos/Makefile b/tests/lxpos/Makefile index 9f6d464c4..abf11f58a 100644 --- a/tests/lxpos/Makefile +++ b/tests/lxpos/Makefile @@ -4,7 +4,7 @@ TEST.tests/lxpos != ls -1 tests/lxpos/out*.dump TEST_SRCDIR.tests/lxpos = tests/lxpos TEST_OUTDIR.tests/lxpos = ${BUILD}/tests/lxpos -LX?=${BUILD}/bin/lx +LX_BIN?=${BUILD}/bin/lx # for lx -l test LEXER += tests/lxpos/lexer.lx @@ -30,14 +30,14 @@ CFLAGS.${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump += -I ${BUILD}/tests/lxpos .for ext in c h -${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.${ext}: tests/lxpos/lexer.lx ${LX} - ${LX} -l ${ext} ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ +${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.${ext}: tests/lxpos/lexer.lx ${LX_BIN} + ${LX_BIN} -l ${ext} ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ || { rm -f $@; false; } .endfor -${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump.c: tests/lxpos/lexer.lx ${LX} - ${LX} -l dump ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ +${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump.c: tests/lxpos/lexer.lx ${LX_BIN} + ${LX_BIN} -l dump ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ || { rm -f $@; 
false; } ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump: ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.h From a8f0c590ab83cf285c66827a0e0e43e8f78a0fe7 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 23 Jul 2025 12:38:11 -0400 Subject: [PATCH 13/80] lx: Make -l dump's output call lx.free() when using dynamic buffer. --- src/lx/print/dump.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/lx/print/dump.c b/src/lx/print/dump.c index 232b2eb31..12cbf9e2e 100644 --- a/src/lx/print/dump.c +++ b/src/lx/print/dump.c @@ -307,6 +307,11 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t} while (t != TOK_ERROR && t != TOK_EOF && t != TOK_UNKNOWN);\n"); fprintf(f, "\n"); + if (api_tokbuf == API_DYNBUF && (~api_exclude & API_BUF)) { + fprintf(f, "\tlx.free(lx.buf_opaque);\n"); + fprintf(f, "\n"); + } + fprintf(f, "\treturn t == TOK_ERROR;\n"); fprintf(f, "}\n"); } From 552aa01161f3b0f3df980fd002e50a003d68f93f Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Jul 2025 10:31:12 -0400 Subject: [PATCH 14/80] lx: Use prefix.tok, not "TOK_". --- src/lx/print/c.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 18f6d664a..fff599a34 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -262,7 +262,7 @@ reject_c(FILE *f, const struct fsm_options *opt, fprintf(f, ";"); return 0; } else { - fprintf(f, "\n\t\t\t\tif (!has_consumed_input) { return TOK_EOF; }\n"); + fprintf(f, "\n\t\t\t\tif (!has_consumed_input) { return %sEOF; }\n", prefix.tok); fprintf(f, "\t\t\t\t"); unget_character(f, false, env->cur_char_var); } @@ -529,7 +529,7 @@ print_io(FILE *f, const struct fsm_options *opt) /* When libfsm's generated code advances a character, update * lx's token name buffer and position bookkeeping. 
*/ fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); - fprintf(f, "#define FSM_ADVANCE_HOOK(C) if (!lx_advance_end(lx, C)) { return TOK_ERROR; }\n"); + fprintf(f, "#define FSM_ADVANCE_HOOK(C) if (!lx_advance_end(lx, C)) { return %sERROR; }\n", prefix.tok); fprintf(f, "#endif\n"); fprintf(f, "\n"); break; @@ -910,7 +910,7 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, } fprintf(f, "\tif (!has_consumed_input) {\n"); - fprintf(f, "\t\treturn TOK_EOF;\n"); + fprintf(f, "\t\treturn %sEOF;\n", prefix.tok); fprintf(f, "\t} \n"); fprintf(f, "\n"); } From beebd1b57d673fa6599b0dcd9b1f845ed890bfcf Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Jul 2025 15:43:09 -0400 Subject: [PATCH 15/80] lx: return TOK_ERROR if reaching the end of a zone function. --- src/lx/print/c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index fff599a34..12601de58 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -912,6 +912,7 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\tif (!has_consumed_input) {\n"); fprintf(f, "\t\treturn %sEOF;\n", prefix.tok); fprintf(f, "\t} \n"); + fprintf(f, "\treturn %sERROR;", prefix.tok); fprintf(f, "\n"); } From ae53e94102890438854f824a0ee093c78265459c Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 29 Jul 2025 15:51:35 -0400 Subject: [PATCH 16/80] lx: Handle unexpected EOF in pattern pairs. Add tests, update out6.dump. Add a couple test cases (in8-10.txt) with an unexpected end of input, either in the middle of a pattern, or after matching the first pattern in a .. pair, but without matching the second. Supporting this changes the expected result for in6.txt: Previously it resulted in TOK_EOF, now it leads to TOK_UNKNOWN and produces a "lexically uncategorised" error message for the unexpected end of input. This change is necessary for fixing #386 / #508, and more generally to detect things like unterminated string literals. 
--- src/lx/print/c.c | 6 ++++++ tests/lxpos/in10.txt | 1 + tests/lxpos/in8.txt | 1 + tests/lxpos/in9.txt | 1 + tests/lxpos/out10.dump | 7 +++++++ tests/lxpos/out6.dump | 2 +- tests/lxpos/out8.dump | 3 +++ tests/lxpos/out9.dump | 7 +++++++ 8 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 tests/lxpos/in10.txt create mode 100644 tests/lxpos/in8.txt create mode 100644 tests/lxpos/in9.txt create mode 100644 tests/lxpos/out10.dump create mode 100644 tests/lxpos/out8.dump create mode 100644 tests/lxpos/out9.dump diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 12601de58..21fbba180 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -192,6 +192,12 @@ accept_c(FILE *f, const struct fsm_options *opt, fprintf(f, "return "); if (m->to != NULL) { fprintf(f, "lx->z = z%u, ", zindexof(ast, m->to)); + } else if (m->to == NULL && m->token == NULL) { + /* If accept-ing here doesn't actually map to a token or + * a different zone, then it's stuck in the middle of a + * pattern pair like `'//' .. /\n/ -> $nl;` with an EOF, + * so tokenization should still fail. 
*/ + fprintf(f, "%sUNKNOWN; ", prefix.tok); } if (m->token != NULL) { fprintf(f, "%s", prefix.tok); diff --git a/tests/lxpos/in10.txt b/tests/lxpos/in10.txt new file mode 100644 index 000000000..52cab2a4f --- /dev/null +++ b/tests/lxpos/in10.txt @@ -0,0 +1 @@ +d: r * t /*distance = rate * time* diff --git a/tests/lxpos/in8.txt b/tests/lxpos/in8.txt new file mode 100644 index 000000000..e5c01f9dd --- /dev/null +++ b/tests/lxpos/in8.txt @@ -0,0 +1 @@ +hello ` diff --git a/tests/lxpos/in9.txt b/tests/lxpos/in9.txt new file mode 100644 index 000000000..81d7792df --- /dev/null +++ b/tests/lxpos/in9.txt @@ -0,0 +1 @@ +d: r * t /*distance = rate * time diff --git a/tests/lxpos/out10.dump b/tests/lxpos/out10.dump new file mode 100644 index 000000000..4314c8beb --- /dev/null +++ b/tests/lxpos/out10.dump @@ -0,0 +1,7 @@ +0-1:1,1-2: +1-2:1,2-3: +3-4:1,4-5: +5-6:1,6-7: +7-8:1,8-9: +34-35:1-2,35-1: lexically uncategorised: ' +' diff --git a/tests/lxpos/out6.dump b/tests/lxpos/out6.dump index 395d995be..c11d09988 100644 --- a/tests/lxpos/out6.dump +++ b/tests/lxpos/out6.dump @@ -1,2 +1,2 @@ 0-1:1,1-2: -5:1,6: +4-5:1,5-6: lexically uncategorised: ' ' diff --git a/tests/lxpos/out8.dump b/tests/lxpos/out8.dump new file mode 100644 index 000000000..3300df2f6 --- /dev/null +++ b/tests/lxpos/out8.dump @@ -0,0 +1,3 @@ +0-5:1,1-6: +6-7:1,7-8: lexically uncategorised: '` +' diff --git a/tests/lxpos/out9.dump b/tests/lxpos/out9.dump new file mode 100644 index 000000000..1f3cf25bc --- /dev/null +++ b/tests/lxpos/out9.dump @@ -0,0 +1,7 @@ +0-1:1,1-2: +1-2:1,2-3: +3-4:1,4-5: +5-6:1,6-7: +7-8:1,8-9: +33-34:1-2,34-1: lexically uncategorised: ' +' From ea9c90b4a2d8ad32e62ca4349231f381b46b19f1 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 30 Jul 2025 10:56:48 -0400 Subject: [PATCH 17/80] lx: Rewrite logic to make the four cases explicit, fix dead code. 
This was previously ending up with a useless call to the current zone after returning the token ("case S1: return TOK_UNKNOWN; lx->z(lx);"), which led to a warning in CI [-Werror=implicit-fallthrough=]. --- src/lx/print/c.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 21fbba180..a03f4ed8b 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -190,20 +190,28 @@ accept_c(FILE *f, const struct fsm_options *opt, } fprintf(f, "return "); - if (m->to != NULL) { - fprintf(f, "lx->z = z%u, ", zindexof(ast, m->to)); - } else if (m->to == NULL && m->token == NULL) { - /* If accept-ing here doesn't actually map to a token or - * a different zone, then it's stuck in the middle of a - * pattern pair like `'//' .. /\n/ -> $nl;` with an EOF, - * so tokenization should still fail. */ - fprintf(f, "%sUNKNOWN; ", prefix.tok); - } - if (m->token != NULL) { - fprintf(f, "%s", prefix.tok); - esctok(f, m->token->s); + if (m->to == NULL) { + if (m->token == NULL) { + /* If accept-ing here doesn't actually map to a token or + * a different zone, then it's stuck in the middle of a + * pattern pair like `'//' .. /\n/ -> $nl;` with an EOF, + * so tokenization should still fail. 
*/ + fprintf(f, "%sUNKNOWN", prefix.tok); + } else { + /* yield a token */ + fprintf(f, "%s", prefix.tok); + esctok(f, m->token->s); + } } else { - fprintf(f, "lx->z(lx)"); + if (m->token == NULL) { + /* update to a different zone, then call to it */ + fprintf(f, "lx->z = z%u, lx->z(lx)", zindexof(ast, m->to)); + } else { + /* update zone, then yield a token */ + fprintf(f, "lx->z = z%u, ", zindexof(ast, m->to)); + fprintf(f, "%s", prefix.tok); + esctok(f, m->token->s); + } } fprintf(f, ";"); return 0; From 17d415de73f3ed6e397ef187413e04454e6367e0 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 31 Jul 2025 13:25:49 -0400 Subject: [PATCH 18/80] lx: Only gen fixedpop / dynpop & calls to them when buffer mode is set. --- src/lx/print/c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index a03f4ed8b..bb48fb084 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -637,7 +637,7 @@ print_buf(FILE *f, const struct fsm_options *opt) fprintf(f, "\n"); - if (~api_exclude & API_BUF) { + if ((~api_exclude & API_BUF) && (api_tokbuf & API_DYNBUF)) { fprintf(f, "static void\n"); fprintf(f, "%sdynpop(void *buf_opaque)\n", prefix.api); fprintf(f, "{\n"); @@ -722,7 +722,7 @@ print_buf(FILE *f, const struct fsm_options *opt) fprintf(f, "}\n"); fprintf(f, "\n"); - if (~api_exclude & API_BUF) { + if (~api_exclude & API_BUF && (api_tokbuf & API_FIXEDBUF)) { fprintf(f, "static void\n"); fprintf(f, "%sfixedpop(void *buf_opaque)\n", prefix.api); fprintf(f, "{\n"); From 7ed18b94e2af4fa64440e2fde8adbdfc92af8a55 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 31 Jul 2025 13:29:28 -0400 Subject: [PATCH 19/80] lx: Suppress warning for possibly unused function. These may or may not be called, depending on the input. 
--- src/lx/print/c.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index bb48fb084..bfc25023e 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -1151,6 +1151,16 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tlx->end.line = 1;\n"); fprintf(f, "\tlx->end.col = 1;\n"); } + + /* Suppress warning for possibly unused function */ + if (~api_exclude & API_BUF) { + if (api_tokbuf & API_FIXEDBUF) { + fprintf(f, "\t(void)%sfixedpop;\n", prefix.api); + } else if (api_tokbuf & API_DYNBUF) { + fprintf(f, "\t(void)%sdynpop;\n", prefix.api); + } + } + fprintf(f, "}\n"); fprintf(f, "\n"); } From 1e55db8ae38001dc68b784c6abc07b43941b6fd7 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 31 Jul 2025 13:30:59 -0400 Subject: [PATCH 20/80] lx: Ensure prefix.api & prefix.lx are used in the generated code. In some cases this was hardcoding "lx_" in the generated code, which could lead to build failures if 'lx -e' was used to override the default prefix. 
--- src/lx/print/c.c | 12 ++++++------ src/lx/print/dump.c | 32 ++++++++++++++++---------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index bfc25023e..5ca179b5a 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -452,7 +452,7 @@ print_io(FILE *f, const struct fsm_options *opt) } fprintf(f, "static int\n"); - fprintf(f, "lx_advance_end(struct lx *lx, int c)\n"); + fprintf(f, "%sadvance_end(struct %slx *lx, int c)\n", prefix.api, prefix.lx); fprintf(f, "{\n"); if (~api_exclude & API_POS) { fprintf(f, "\tlx->end.byte++;\n"); @@ -496,7 +496,7 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(f, "inline\n"); fprintf(f, "#endif\n"); fprintf(f, "static int\n"); - fprintf(f, "lx_getc(struct %slx *lx)\n", prefix.lx); + fprintf(f, "%sgetc(struct %slx *lx)\n", prefix.api, prefix.lx); fprintf(f, "{\n"); fprintf(f, "\tint c;\n"); fprintf(f, "\n"); @@ -518,7 +518,7 @@ print_io(FILE *f, const struct fsm_options *opt) /* FIXME: This should distinguish between alloc failure * and EOF, but will require layers of interface changes. */ - fprintf(f, "\tif (!lx_advance_end(lx, c)) { return EOF; }\n"); + fprintf(f, "\tif (!%sadvance_end(lx, c)) { return EOF; }\n", prefix.api); fprintf(f, "\n"); fprintf(f, "\treturn c;\n"); @@ -527,13 +527,13 @@ print_io(FILE *f, const struct fsm_options *opt) /* Add an implementation of fsm_getc that calls back * into lx_getc with the lx handle. */ - fprintf(f, "/* This wrapper adapts calling lx_getc to the interface\n"); + fprintf(f, "/* This wrapper adapts calling %sgetc to the interface\n", prefix.api); fprintf(f, " * in libfsm's generated code. 
*/\n"); fprintf(f, "static int\n"); fprintf(f, "fsm_getc(void *getc_opaque)\n"); fprintf(f, "{\n"); - fprintf(f, "\treturn lx_getc((struct lx *)getc_opaque);\n"); + fprintf(f, "\treturn %sgetc((struct %slx *)getc_opaque);\n", prefix.api, prefix.lx); fprintf(f, "}\n"); fprintf(f, "\n"); break; @@ -543,7 +543,7 @@ print_io(FILE *f, const struct fsm_options *opt) /* When libfsm's generated code advances a character, update * lx's token name buffer and position bookkeeping. */ fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); - fprintf(f, "#define FSM_ADVANCE_HOOK(C) if (!lx_advance_end(lx, C)) { return %sERROR; }\n", prefix.tok); + fprintf(f, "#define FSM_ADVANCE_HOOK(C) if (!%sadvance_end(lx, C)) { return %sERROR; }\n", prefix.api, prefix.tok); fprintf(f, "#endif\n"); fprintf(f, "\n"); break; diff --git a/src/lx/print/dump.c b/src/lx/print/dump.c index 12cbf9e2e..e693f9082 100644 --- a/src/lx/print/dump.c +++ b/src/lx/print/dump.c @@ -68,12 +68,12 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "main(int argc, char *argv[])\n"); fprintf(f, "{\n"); - fprintf(f, "\tenum lx_token t;\n"); - fprintf(f, "\tstruct lx lx = { 0 };\n"); + fprintf(f, "\tenum %stoken t;\n", prefix.api); + fprintf(f, "\tstruct %slx lx = { 0 };\n", prefix.lx); switch (opt->io) { case FSM_IO_GETC: - fprintf(f, "\tint (*lgetc)(struct lx *lx);\n"); + fprintf(f, "\tint (*lgetc)(struct %slx *lx);\n", prefix.lx); fprintf(f, "\tvoid *getc_opaque;\n"); break; @@ -135,7 +135,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) switch (api_getc) { case API_FGETC: - fprintf(f, "\tlgetc = lx_fgetc;\n"); + fprintf(f, "\tlgetc = %sfgetc;\n", prefix.api); fprintf(f, "\tgetc_opaque = stdin;\n"); fprintf(f, "\n"); break; @@ -144,7 +144,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\ts = argv[1];\n"); fprintf(f, "\n"); - fprintf(f, "\tlgetc = lx_sgetc;\n"); + fprintf(f, "\tlgetc = %ssgetc;\n", 
prefix.api); fprintf(f, "\tgetc_opaque = &s;\n"); fprintf(f, "\n"); break; @@ -154,7 +154,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tarr.len = strlen(arr.p);\n"); fprintf(f, "\n"); - fprintf(f, "\tlgetc = lx_agetc;\n"); + fprintf(f, "\tlgetc = %sagetc;\n", prefix.api); fprintf(f, "\tgetc_opaque = &arr;\n"); fprintf(f, "\n"); break; @@ -167,13 +167,13 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\td.fd = fileno(stdin);\n"); fprintf(f, "\n"); - fprintf(f, "\tlgetc = lx_dgetc;\n"); + fprintf(f, "\tlgetc = %sdgetc;\n", prefix.api); fprintf(f, "\tgetc_opaque = &d;\n"); fprintf(f, "\n"); break; } - fprintf(f, "\tlx_init(&lx);\n"); + fprintf(f, "\t%sinit(&lx);\n", prefix.api); fprintf(f, "\n"); switch (opt->io) { @@ -203,9 +203,9 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) if (~api_exclude & API_BUF) { fprintf(f, "\tlx.buf_opaque = &buf;\n"); - fprintf(f, "\tlx.push = lx_dynpush;\n"); - fprintf(f, "\tlx.clear = lx_dynclear;\n"); - fprintf(f, "\tlx.free = lx_dynfree;\n"); + fprintf(f, "\tlx.push = %sdynpush;\n", prefix.api); + fprintf(f, "\tlx.clear = %sdynclear;\n", prefix.api); + fprintf(f, "\tlx.free = %sdynfree;\n", prefix.api); } fprintf(f, "\n"); break; @@ -218,8 +218,8 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) if (~api_exclude & API_BUF) { fprintf(f, "\tlx.buf_opaque = &buf;\n"); - fprintf(f, "\tlx.push = lx_fixedpush;\n"); - fprintf(f, "\tlx.clear = lx_fixedclear;\n"); + fprintf(f, "\tlx.push = %sfixedpush;\n", prefix.api); + fprintf(f, "\tlx.clear = %sfixedclear;\n", prefix.api); fprintf(f, "\tlx.free = NULL;\n"); } fprintf(f, "\n"); @@ -231,7 +231,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t\tconst char *q;\n"); fprintf(f, "\n"); - fprintf(f, "\t\tt = lx_next(&lx);\n"); + fprintf(f, "\t\tt = %snext(&lx);\n", prefix.api); fprintf(f, "\n"); 
switch (api_tokbuf) { @@ -278,7 +278,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\n"); fprintf(f, "\t\tcase TOK_ERROR:\n"); - fprintf(f, "\t\t\tperror(\"lx_next\");\n"); + fprintf(f, "\t\t\tperror(\"%snext\");\n", prefix.api); fprintf(f, "\t\t\tbreak;\n"); fprintf(f, "\n"); @@ -291,7 +291,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t\tdefault:\n"); if (~api_exclude & API_NAME) { - fprintf(f, "\t\t\tprintf(\"<%%s\", lx_name(t));\n"); + fprintf(f, "\t\t\tprintf(\"<%%s\", %sname(t));\n", prefix.api); fprintf(f, "\t\t\tdump_buf(q, l);\n"); fprintf(f, "\t\t\tprintf(\">\\n\");\n"); } else { From 4a5ca8458daa410fef43e2303e141f09f4d7b6f7 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 4 Aug 2025 10:52:56 -0400 Subject: [PATCH 21/80] Replace FSM_ADVANCE_HOOK macro with optional hooks->advance callback. --- include/fsm/print.h | 6 ++++++ src/libfsm/print/c.c | 36 ++++++++++-------------------------- src/lx/print/c.c | 35 ++++++++++++++++++++++------------- 3 files changed, 38 insertions(+), 39 deletions(-) diff --git a/include/fsm/print.h b/include/fsm/print.h index 6fae2abea..c9ec7ec0a 100644 --- a/include/fsm/print.h +++ b/include/fsm/print.h @@ -78,6 +78,12 @@ struct fsm_hooks { const struct fsm_state_metadata *state_metadata, void *lang_opaque, void *hook_opaque); + /* If non-NULL, this will be called to generate code + * in scope immediately after advancing to the + * next character of input. 
*/ + int (*advance)(FILE *, const struct fsm_options *opt, + const char *cur_char_var, void *hook_opaque); + int (*comment)(FILE *, const struct fsm_options *opt, const struct fsm_state_metadata *state_metadata, void *hook_opaque); diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index 96117f21e..d8e12eee6 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -495,30 +495,6 @@ fsm_print_c_body(FILE *f, const struct ir *ir, * input loop was skipped it would still be NONE. */ fprintf(f, "\tint has_consumed_input = 0;\n"); - /* For FSM_IO_STR and FSM_IO_PAIR, define a macro that will be - * called with the new character every time iteration advances. - * This is used by lx's internal bookkeeping to track token - * positions in the input stream. For FSM_IO_GETC, the generated - * getc function handles this directly. - * - * This defaults to a no-op unless defined. */ - switch (opt->io) { - case FSM_IO_GETC: - break; /* nothing to do */ - - case FSM_IO_STR: - fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); - fprintf(f, "#define FSM_ADVANCE_HOOK(C) /* no-op */ (void)C\n"); - fprintf(f, "#endif\n"); - break; - - case FSM_IO_PAIR: - fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); - fprintf(f, "#define FSM_ADVANCE_HOOK(C) /* no-op */ (void)C\n"); - fprintf(f, "#endif\n"); - break; - } - /* enum of states */ print_stateenum(f, ir->n); fprintf(f, "\n"); @@ -536,13 +512,21 @@ fsm_print_c_body(FILE *f, const struct ir *ir, case FSM_IO_STR: fprintf(f, "\tfor (p = s; *p != '\\0'; p++) {\n"); fprintf(f, "\t\thas_consumed_input = 1;\n"); - fprintf(f, "\t\tFSM_ADVANCE_HOOK(%s);\n", cp); + if (hooks->advance != NULL) { + if (-1 == hooks->advance(f, opt, cp, hooks->hook_opaque)) { + return -1; + } + } break; case FSM_IO_PAIR: fprintf(f, "\tfor (p = b; p != e; p++) {\n"); fprintf(f, "\t\thas_consumed_input = 1;\n"); - fprintf(f, "\t\tFSM_ADVANCE_HOOK(%s);\n", cp); + if (hooks->advance != NULL) { + if (-1 == hooks->advance(f, opt, cp, hooks->hook_opaque)) { + return -1; + } + } 
break; } diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 5ca179b5a..97eb8281f 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -297,6 +297,26 @@ reject_c(FILE *f, const struct fsm_options *opt, return 0; } +static int +advance_c(FILE *f, const struct fsm_options *opt, const char *cur_char_var, void *hook_opaque) +{ + (void)hook_opaque; + + switch (opt->io) { + case FSM_IO_GETC: + break; + + case FSM_IO_STR: + case FSM_IO_PAIR: + /* When libfsm's generated code advances a character, update + * lx's token name buffer and position bookkeeping. */ + fprintf(f, "\t\tif (!%sadvance_end(lx, %s)) { return %sERROR; }\n", + prefix.api, cur_char_var, prefix.tok); + break; + } + return 0; +} + static void print_proto(FILE *f, const struct ast *ast, const struct ast_zone *z) { @@ -485,8 +505,7 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(f, "}\n"); fprintf(f, "\n"); - switch (opt->io) { - case FSM_IO_GETC: + if (opt->io == FSM_IO_GETC) { /* TODO: consider passing char *c, and return int 0/-1 for error */ if (opt->comments) { fprintf(f, "/* This wrapper manages one character of lookahead/pushback\n"); @@ -536,17 +555,6 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(f, "\treturn %sgetc((struct %slx *)getc_opaque);\n", prefix.api, prefix.lx); fprintf(f, "}\n"); fprintf(f, "\n"); - break; - - case FSM_IO_PAIR: - case FSM_IO_STR: - /* When libfsm's generated code advances a character, update - * lx's token name buffer and position bookkeeping. 
*/ - fprintf(f, "#ifndef FSM_ADVANCE_HOOK\n"); - fprintf(f, "#define FSM_ADVANCE_HOOK(C) if (!%sadvance_end(lx, C)) { return %sERROR; }\n", prefix.api, prefix.tok); - fprintf(f, "#endif\n"); - fprintf(f, "\n"); - break; } fprintf(f, "#if __STDC_VERSION__ >= 199901L\n"); @@ -831,6 +839,7 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, hooks.accept = accept_c; hooks.reject = reject_c; + hooks.advance = advance_c; hooks.hook_opaque = &hook_env; fsm_print(f, z->fsm, opt, &hooks, FSM_PRINT_C); From f25e8b7d874d2bea24eaea74bc112a0d1aacde9c Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 4 Aug 2025 10:56:05 -0400 Subject: [PATCH 22/80] The advance hook should also be called for FSM_IO_STR. --- src/libfsm/print/c.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index d8e12eee6..86f9855b1 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -512,24 +512,20 @@ fsm_print_c_body(FILE *f, const struct ir *ir, case FSM_IO_STR: fprintf(f, "\tfor (p = s; *p != '\\0'; p++) {\n"); fprintf(f, "\t\thas_consumed_input = 1;\n"); - if (hooks->advance != NULL) { - if (-1 == hooks->advance(f, opt, cp, hooks->hook_opaque)) { - return -1; - } - } break; case FSM_IO_PAIR: fprintf(f, "\tfor (p = b; p != e; p++) {\n"); fprintf(f, "\t\thas_consumed_input = 1;\n"); - if (hooks->advance != NULL) { - if (-1 == hooks->advance(f, opt, cp, hooks->hook_opaque)) { - return -1; - } - } break; } + if (hooks->advance != NULL) { + if (-1 == hooks->advance(f, opt, cp, hooks->hook_opaque)) { + return -1; + } + } + if (-1 == fsm_print_cfrag(f, ir, opt, hooks, cp)) { return -1; } From 051aaf0337b8951383ebbc00defbe85488f56f13 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 4 Aug 2025 10:59:33 -0400 Subject: [PATCH 23/80] Move setting `has_consumed_input` flag into lx's advance hook. This avoids cluttering libfsm's print output with `has_consumed_input`, which is specific to lx. 
--- src/libfsm/print/c.c | 18 ------------------ src/lx/print/c.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index 86f9855b1..7f9274792 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -484,17 +484,6 @@ fsm_print_c_body(FILE *f, const struct ir *ir, } } - /* This flag indicates whether the any of the input stream was - * consumed before getting EOF and skipping over the state and - * character logic expanded here. - * - * lx needs to track this for proper EOF handling. It previously - * generated the state enum itself, so that it could include an - * additional 'NONE' state. Inside the input loop, the default - * state of NONE would be updated to the start state, but if the - * input loop was skipped it would still be NONE. */ - fprintf(f, "\tint has_consumed_input = 0;\n"); - /* enum of states */ print_stateenum(f, ir->n); fprintf(f, "\n"); @@ -506,17 +495,14 @@ fsm_print_c_body(FILE *f, const struct ir *ir, switch (opt->io) { case FSM_IO_GETC: fprintf(f, "\twhile (c = fsm_getc(getc_opaque), c != EOF) {\n"); - fprintf(f, "\t\thas_consumed_input = 1;\n"); break; case FSM_IO_STR: fprintf(f, "\tfor (p = s; *p != '\\0'; p++) {\n"); - fprintf(f, "\t\thas_consumed_input = 1;\n"); break; case FSM_IO_PAIR: fprintf(f, "\tfor (p = b; p != e; p++) {\n"); - fprintf(f, "\t\thas_consumed_input = 1;\n"); break; } @@ -533,10 +519,6 @@ fsm_print_c_body(FILE *f, const struct ir *ir, fprintf(f, "\t}\n"); fprintf(f, "\n"); - /* Suppress unused variable warning -- this is mainly for lx. 
*/ - fprintf(f, "\t(void)has_consumed_input;\n"); - fprintf(f, "\n"); - /* end states */ if (-1 == print_endstates(f, opt, hooks, ir)) { return -1; diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 97eb8281f..9191202df 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -302,6 +302,8 @@ advance_c(FILE *f, const struct fsm_options *opt, const char *cur_char_var, void { (void)hook_opaque; + fprintf(f, "\t\thas_consumed_input = 1;\n"); + switch (opt->io) { case FSM_IO_GETC: break; @@ -783,6 +785,17 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "z%u(struct %slx *lx)\n", zindexof(ast, z), prefix.lx); fprintf(f, "{\n"); + /* This flag indicates whether the any of the input stream was + * consumed before getting EOF and skipping over the state and + * character logic expanded here. + * + * lx needs to track this for proper EOF handling. It previously + * generated the state enum itself, so that it could include an + * additional 'NONE' state. Inside the input loop, the default + * state of NONE would be updated to the start state, but if the + * input loop was skipped it would still be NONE. */ + fprintf(f, "\tint has_consumed_input = 0;\n"); + switch (opt->io) { case FSM_IO_GETC: fprintf(f, "\tint c;\n"); From 08fd72ccd9fff200c7d5f3921f74f49b7ab0c963 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 18 Aug 2025 17:01:13 -0400 Subject: [PATCH 24/80] lx: Avoid useless call to pop and some other 'unused' warnings. Tested with every combination of (dyn+fgetc, fixed+fgetc, pair, str), with and without '-x buf', '-x pos', or both. 
--- src/lx/print/c.c | 73 ++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 9191202df..77207403a 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -153,7 +153,7 @@ static void unget_character(FILE *f, bool pop, const char *cur_char_var) { fprintf(f, "%sungetc(lx, %s); ", prefix.api, cur_char_var); - if (pop && (~api_exclude & API_POS)) { + if (pop && (~api_exclude & API_BUF)) { fprintf(f, "%s%spop(lx->buf_opaque); ", prefix.api, buf_op_prefix()); } @@ -312,8 +312,10 @@ advance_c(FILE *f, const struct fsm_options *opt, const char *cur_char_var, void case FSM_IO_PAIR: /* When libfsm's generated code advances a character, update * lx's token name buffer and position bookkeeping. */ - fprintf(f, "\t\tif (!%sadvance_end(lx, %s)) { return %sERROR; }\n", - prefix.api, cur_char_var, prefix.tok); + if (~api_exclude & API_POS) { + fprintf(f, "\t\tif (!%sadvance_end(lx, %s)) { return %sERROR; }\n", + prefix.api, cur_char_var, prefix.tok); + } break; } return 0; @@ -473,40 +475,47 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(stderr, " io"); } - fprintf(f, "static int\n"); - fprintf(f, "%sadvance_end(struct %slx *lx, int c)\n", prefix.api, prefix.lx); - fprintf(f, "{\n"); - if (~api_exclude & API_POS) { - fprintf(f, "\tlx->end.byte++;\n"); - fprintf(f, "\tlx->end.col++;\n"); + if (opt->io == FSM_IO_GETC || (~api_exclude & API_POS)) { + fprintf(f, "static int\n"); + fprintf(f, "%sadvance_end(struct %slx *lx, int c)\n", prefix.api, prefix.lx); + fprintf(f, "{\n"); - fprintf(f, "\tif (c == '\\n') {\n"); - fprintf(f, "\t\tlx->end.line++;\n"); - fprintf(f, "\t\tlx->end.saved_col = lx->end.col - 1;\n"); - fprintf(f, "\t\tlx->end.col = 1;\n"); - - if (opt->io == FSM_IO_STR) { /* ignore terminating '\0' */ - fprintf(f, "\t} else if (c == '\\0') { /* don't count terminating '\\0' */\n"); - fprintf(f, "\t\tlx->end.byte--;\n"); - fprintf(f, "\t\tlx->end.col--;\n"); 
- fprintf(f, "\t}\n"); + if (api_exclude & API_POS) { + fprintf(f, "\t(void)lx; (void)c;\n"); + } else { + fprintf(f, "\tlx->end.byte++;\n"); + fprintf(f, "\tlx->end.col++;\n"); + + fprintf(f, "\tif (c == '\\n') {\n"); + fprintf(f, "\t\tlx->end.line++;\n"); + fprintf(f, "\t\tlx->end.saved_col = lx->end.col - 1;\n"); + fprintf(f, "\t\tlx->end.col = 1;\n"); + + if (opt->io == FSM_IO_STR) { /* ignore terminating '\0' */ + fprintf(f, "\t} else if (c == '\\0') { /* don't count terminating '\\0' */\n"); + fprintf(f, "\t\tlx->end.byte--;\n"); + fprintf(f, "\t\tlx->end.col--;\n"); + fprintf(f, "\t}\n"); + } else { + fprintf(f, "\t}\n"); + } + } + + if (api_exclude & API_BUF) { + fprintf(f, "\t(void)lx; (void)c;\n"); } else { + fprintf(f, "\tif (lx->push != NULL) {\n"); + fprintf(f, "\t\tif (-1 == lx->push(lx->buf_opaque, (char)c)) {\n"); + fprintf(f, "\t\t\treturn 0;\n"); + fprintf(f, "\t\t}\n"); fprintf(f, "\t}\n"); } - } - if (~api_exclude & API_BUF) { - fprintf(f, "\tif (lx->push != NULL) {\n"); - fprintf(f, "\t\tif (-1 == lx->push(lx->buf_opaque, (char)c)) {\n"); - fprintf(f, "\t\t\treturn 0;\n"); - fprintf(f, "\t\t}\n"); - fprintf(f, "\t}\n"); + fprintf(f, "\treturn 1;\n"); + fprintf(f, "}\n"); + fprintf(f, "\n"); } - fprintf(f, "\treturn 1;\n"); - fprintf(f, "}\n"); - fprintf(f, "\n"); - if (opt->io == FSM_IO_GETC) { /* TODO: consider passing char *c, and return int 0/-1 for error */ if (opt->comments) { @@ -582,7 +591,9 @@ print_io(FILE *f, const struct fsm_options *opt) break; } - if (~api_exclude & API_POS) { + if (api_exclude & API_POS) { + fprintf(f, "\t(void)lx; (void)c;\n"); + } else { fprintf(f, "\tlx->end.byte--;\n"); fprintf(f, "\tlx->end.col--;\n"); fprintf(f, "\n"); From c853157a6a640e723eaacd3c5f9e7caa07bdfcc6 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 26 Aug 2025 14:54:42 -0400 Subject: [PATCH 25/80] lx: Distinguish between unexpected EOF and EOF in ignored zones. 
PR #509 introduced a bug: It didn't distinguish between an unexpected end of input and an end of input in a zone that matches but ignores its input. This caused several lxpos tests to fail due to getting a TOK_UNKNOWN rather than a TOK_EOF when the input has trailing whitespace, but I didn't notice until after merging because the normal build doesn't regenerate the code for src/lx/lexer.lx or src/libfsm/lexer.lx. (I had ensured all the libre dialect lexers and parsers were regenerated, but missed those.) Instead of always printing TOK_UNKNOWN, this inspects the zone mappings to determine whether the current end ID represents a dead end for the zone. If not, it should instead print TOK_EOF. --- src/lx/print/c.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/lx/print/c.c b/src/lx/print/c.c index 77207403a..3340f0d74 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -159,6 +159,28 @@ unget_character(FILE *f, bool pop, const char *cur_char_var) } } +static bool +endid_represents_dead_end(fsm_end_id_t endid, const struct ast *ast) +{ + const struct ast_mapping *m = ast_getendmappingbyendid(endid); + if (m == NULL) { + return false; + } + + /* For each zone, check if this endid is associated with its z->ml zone. + * If so, that endid is the "dead end" for that zone. + * + * The total number of zones and end ids (each corresponding to mapping) + * should stay small enough that linear search is fine. If this becomes + * prohibitively expensive, then build a bitset of dead-end IDs upfront + * in one pass. 
*/ + for (struct ast_zone *z = ast->zl; z != NULL; z = z->next) { + if (z->ml == m) { return true; } + } + + return false; +} + static int accept_c(FILE *f, const struct fsm_options *opt, const struct fsm_state_metadata *state_metadata, @@ -193,10 +215,21 @@ accept_c(FILE *f, const struct fsm_options *opt, if (m->to == NULL) { if (m->token == NULL) { /* If accept-ing here doesn't actually map to a token or - * a different zone, then it's stuck in the middle of a - * pattern pair like `'//' .. /\n/ -> $nl;` with an EOF, - * so tokenization should still fail. */ - fprintf(f, "%sUNKNOWN", prefix.tok); + * a different zone, then check whether the endid represents + * a dead end. In that case, it's stuck in the middle of a + * pattern pair like `'//' .. /\n/ -> $nl;` with an unexpected + * EOF, so tokenization should still fail (with TOK_UNKNOWN). + * + * An example where the endid doesn't represent a dead end is + * a zone ignoring trailing whitespace in a file, such as + * `/[\r\n\t ]+/;`. In that case, the EOF is valid, so still + * return TOK_EOF. */ + const fsm_end_id_t endid = state_metadata->end_ids[0]; + if (endid_represents_dead_end(endid, ast)) { + fprintf(f, "%sUNKNOWN", prefix.tok); + } else { + fprintf(f, "%sEOF", prefix.tok); + } } else { /* yield a token */ fprintf(f, "%s", prefix.tok); From 9be4aecb1625defde993db68524dd863873a0ffb Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 26 Aug 2025 15:02:42 -0400 Subject: [PATCH 26/80] Generated code. Re-generate lexers and parsers with lx bug fixed. 
--- src/libfsm/lexer.c | 475 +++++--- src/libfsm/parser.c | 74 +- src/libfsm/parser.h | 4 +- src/libre/dialect/glob/lexer.c | 106 +- src/libre/dialect/glob/parser.c | 38 +- src/libre/dialect/glob/parser.h | 4 +- src/libre/dialect/like/lexer.c | 104 +- src/libre/dialect/like/parser.c | 38 +- src/libre/dialect/like/parser.h | 4 +- src/libre/dialect/literal/lexer.c | 96 +- src/libre/dialect/literal/parser.c | 26 +- src/libre/dialect/literal/parser.h | 4 +- src/libre/dialect/native/lexer.c | 490 +++++--- src/libre/dialect/native/parser.c | 204 ++-- src/libre/dialect/native/parser.h | 4 +- src/libre/dialect/pcre/lexer.c | 1810 +++++++++++++++++----------- src/libre/dialect/pcre/parser.c | 318 ++--- src/libre/dialect/pcre/parser.h | 4 +- src/libre/dialect/sql/lexer.c | 366 ++++-- src/libre/dialect/sql/parser.c | 154 +-- src/libre/dialect/sql/parser.h | 4 +- src/lx/lexer.c | 679 ++++++----- src/lx/parser.c | 182 +-- src/lx/parser.h | 4 +- 24 files changed, 3029 insertions(+), 2163 deletions(-) diff --git a/src/libfsm/lexer.c b/src/libfsm/lexer.c index 8bd374cec..3bf26b3b6 100644 --- a/src/libfsm/lexer.c +++ b/src/libfsm/lexer.c @@ -15,6 +15,26 @@ static enum lx_token z3(struct lx *lx); static enum lx_token z4(struct lx *lx); static enum lx_token z5(struct lx *lx); +static int +lx_advance_end(struct lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. 
*/ #if __STDC_VERSION__ >= 199901L inline #endif @@ -35,18 +55,19 @@ lx_getc(struct lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_getc((struct lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -55,10 +76,7 @@ lx_ungetc(struct lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -68,13 +86,20 @@ lx_ungetc(struct lx *lx, int c) } } +/* Get a character from fgetc and push it to the buffer */ int lx_fgetc(struct lx *lx) { assert(lx != NULL); assert(lx->getc_opaque != NULL); - return fgetc(lx->getc_opaque); + const int c = fgetc(lx->getc_opaque); + if (c == EOF) { + lx->c = EOF; + return EOF; + } else { + return c; + } } int @@ -119,6 +144,17 @@ lx_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_dynclear(void *buf_opaque) { @@ -158,44 +194,53 @@ lx_dynfree(void *buf_opaque) static enum lx_token z0(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\n': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. 
"a" */ - lx_ungetc(lx, c); return lx->z(lx); + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); case S2: /* e.g. "" */ - lx_ungetc(lx, c); return lx->z = z1, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z1, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return lx->z = z1, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S0: @@ -212,44 +257,40 @@ z0(struct lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z1(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '0': case '1': @@ -268,7 +309,9 @@ z1(struct lx *lx) case '\r': case ' ': state = S4; break; case ']': state = S5; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -284,15 +327,15 @@ z1(struct lx *lx) case '7': case '8': case '9': break; - default: lx_ungetc(lx, c); return TOK_ENDID; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ENDID; } break; case S2: /* e.g. 
"," */ - lx_ungetc(lx, c); return TOK_COMMA; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_COMMA; case S3: /* e.g. "#" */ - lx_ungetc(lx, c); return lx->z = z0, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z0, lx->z(lx); case S4: /* e.g. "\\x09" */ switch ((unsigned char) c) { @@ -300,16 +343,29 @@ z1(struct lx *lx) case '\n': case '\r': case ' ': break; - default: lx_ungetc(lx, c); return lx->z(lx); + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); } break; case S5: /* e.g. "]" */ - lx_ungetc(lx, c); return lx->z = z5, TOK_CLOSEENDIDS; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, TOK_CLOSEENDIDS; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_ENDID; + case S2: return TOK_COMMA; + case S3: return lx->z = z0, lx->z(lx); + case S4: return TOK_EOF; + case S5: return lx->z = z5, TOK_CLOSEENDIDS; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S3: @@ -325,106 +381,105 @@ z1(struct lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_ENDID; - case S2: return TOK_COMMA; - case S3: return TOK_EOF; - case S4: return TOK_EOF; - case S5: return TOK_CLOSEENDIDS; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z2(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '\'': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "'" */ - lx_ungetc(lx, c); return lx->z = z5, TOK_LABEL; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, TOK_LABEL; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return lx->z = z5, TOK_LABEL; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_LABEL; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z3(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; case '"': state = S3; break; @@ -450,15 +505,15 @@ z3(struct lx *lx) case 'r': case 't': case 'v': state = S6; break; - default: lx_ungetc(lx, c); return TOK_CHAR; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S2: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S2: /* e.g. 
"\\x00" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z5, TOK_LABEL; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, TOK_LABEL; case S4: /* e.g. "\\x" */ switch ((unsigned char) c) { @@ -484,7 +539,9 @@ z3(struct lx *lx) case 'd': case 'e': case 'f': state = S7; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -498,14 +555,14 @@ z3(struct lx *lx) case '5': case '6': case '7': break; - default: lx_ungetc(lx, c); return TOK_OCT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S6: /* e.g. "\\f" */ - lx_ungetc(lx, c); return TOK_ESC; + case S6: /* e.g. "\\\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ESC; - case S7: /* e.g. "\\xa" */ + case S7: /* e.g. "\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -529,76 +586,92 @@ z3(struct lx *lx) case 'd': case 'e': case 'f': break; - default: lx_ungetc(lx, c); return TOK_HEX; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_HEX; } break; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; - case S3: return TOK_LABEL; + case S3: return lx->z = z5, TOK_LABEL; case S5: return TOK_OCT; case S6: return TOK_ESC; case S7: return TOK_HEX; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } 
static enum lx_token z4(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\n': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return lx->z(lx); + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); case S2: /* e.g. "" */ - lx_ungetc(lx, c); return lx->z = z5, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return lx->z = z5, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S0: @@ -615,46 +688,42 @@ z4(struct lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z5(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, 
S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case ',': state = S1; break; case ';': state = S2; break; @@ -732,31 +801,35 @@ z5(struct lx *lx) case '\n': case '\r': case ' ': state = S13; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S1: /* e.g. "," */ - lx_ungetc(lx, c); return TOK_COMMA; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_COMMA; case S2: /* e.g. ";" */ - lx_ungetc(lx, c); return TOK_SEP; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_SEP; - case S3: /* e.g. "?" */ - lx_ungetc(lx, c); return TOK_ANY; + case S3: /* e.g. "\077" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ANY; case S4: /* e.g. "-" */ switch ((unsigned char) c) { case '>': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S5: /* e.g. "[" */ - lx_ungetc(lx, c); return lx->z = z1, TOK_OPENENDIDS; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENENDIDS; case S6: /* e.g. "=" */ - lx_ungetc(lx, c); return TOK_EQUALS; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_EQUALS; case S7: /* e.g. 
"e" */ switch ((unsigned char) c) { @@ -823,7 +896,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'n': state = S19; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -892,11 +965,11 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 't': state = S14; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; - case S9: /* e.g. "a" */ + case S9: /* e.g. "0" */ switch ((unsigned char) c) { case '0': case '1': @@ -961,18 +1034,18 @@ z5(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; case S10: /* e.g. "'" */ - lx_ungetc(lx, c); return lx->z = z2, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z2, lx->z(lx); case S11: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z3, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z3, lx->z(lx); case S12: /* e.g. "#" */ - lx_ungetc(lx, c); return lx->z = z4, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, lx->z(lx); case S13: /* e.g. 
"\\x09" */ switch ((unsigned char) c) { @@ -980,7 +1053,7 @@ z5(struct lx *lx) case '\n': case '\r': case ' ': break; - default: lx_ungetc(lx, c); return lx->z(lx); + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); } break; @@ -1049,7 +1122,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'a': state = S15; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1118,7 +1191,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'r': state = S16; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1187,7 +1260,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 't': state = S17; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1257,12 +1330,12 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case ':': state = S18; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; case S18: /* e.g. "start:" */ - lx_ungetc(lx, c); return TOK_START; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_START; case S19: /* e.g. "en" */ switch ((unsigned char) c) { @@ -1329,7 +1402,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'd': state = S20; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1399,53 +1472,34 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case ':': state = S21; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; case S21: /* e.g. 
"end:" */ - lx_ungetc(lx, c); return TOK_END; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_END; case S22: /* e.g. "->" */ - lx_ungetc(lx, c); return TOK_TO; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TO; default: ; /* unreached */ } - - switch (state) { - case S10: - case S11: - case S12: - case S13: - break; - - default: - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } - break; - - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_COMMA; case S2: return TOK_SEP; case S3: return TOK_ANY; - case S5: return TOK_OPENENDIDS; + case S5: return lx->z = z1, TOK_OPENENDIDS; case S6: return TOK_EQUALS; case S7: return TOK_IDENT; case S8: return TOK_IDENT; case S9: return TOK_IDENT; - case S10: return TOK_EOF; - case S11: return TOK_EOF; - case S12: return TOK_EOF; + case S10: return lx->z = z2, lx->z(lx); + case S11: return lx->z = z3, lx->z(lx); + case S12: return lx->z = z4, lx->z(lx); case S13: return TOK_EOF; case S14: return TOK_IDENT; case S15: return TOK_IDENT; @@ -1456,8 +1510,34 @@ z5(struct lx *lx) case S20: return TOK_IDENT; case S21: return TOK_END; case S22: return TOK_TO; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + switch (state) { + case S10: + case S11: + case S12: + case S13: + break; + + default: + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + break; + + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -1640,6 +1720,7 @@ lx_init(struct lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_dynpop; } enum lx_token diff --git a/src/libfsm/parser.c b/src/libfsm/parser.c index e4ac8a31b..ec9bf4f78 100644 --- a/src/libfsm/parser.c +++ b/src/libfsm/parser.c @@ 
-9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 153 "src/libfsm/parser.act" +#line 27 "src/libfsm/parser.act" #include @@ -179,7 +179,7 @@ p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: CHAR */ { -#line 247 "src/libfsm/parser.act" +#line 243 "src/libfsm/parser.act" assert(lex_state->buf.a[0] != '\0'); assert(lex_state->buf.a[1] == '\0'); @@ -196,7 +196,7 @@ p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: ESC */ { -#line 171 "src/libfsm/parser.act" +#line 167 "src/libfsm/parser.act" assert(0 == strncmp(lex_state->buf.a, "\\", 1)); assert(2 == strlen(lex_state->buf.a)); @@ -224,7 +224,7 @@ p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: HEX */ { -#line 240 "src/libfsm/parser.act" +#line 214 "src/libfsm/parser.act" unsigned long u; char *e; @@ -263,7 +263,7 @@ p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: OCT */ { -#line 211 "src/libfsm/parser.act" +#line 185 "src/libfsm/parser.act" unsigned long u; char *e; @@ -338,7 +338,7 @@ ZL2_items:; case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT */ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIa = xstrdup(lex_state->buf.a); @@ -366,7 +366,7 @@ ZL2_items:; goto ZL2_items; /* END OF INLINE: items */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -451,7 +451,7 @@ ZL2_xend_C_Cend_Hstates:; } /* END OF INLINE: xend::end-states */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -489,7 +489,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) { /* BEGINNING OF ACTION: err-expected-start */ { -#line 404 "src/libfsm/parser.act" +#line 402 "src/libfsm/parser.act" err_expected(lex_state, "'start:'"); @@ -507,7 +507,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT 
*/ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIn = xstrdup(lex_state->buf.a); @@ -530,7 +530,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) } /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((ZIn)); @@ -588,7 +588,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: mark-start */ { -#line 336 "src/libfsm/parser.act" +#line 335 "src/libfsm/parser.act" fsm_setstart(fsm, (ZIs)); @@ -597,7 +597,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) /* END OF ACTION: mark-start */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((ZIn)); @@ -639,7 +639,7 @@ p_xend(fsm fsm, lex_state lex_state, act_state act_state) { /* BEGINNING OF ACTION: err-expected-end */ { -#line 408 "src/libfsm/parser.act" +#line 406 "src/libfsm/parser.act" err_expected(lex_state, "'end:'"); @@ -687,7 +687,7 @@ p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT */ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIn = xstrdup(lex_state->buf.a); @@ -705,7 +705,7 @@ p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z /* END OF INLINE: ident */ /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((ZIn)); @@ -763,7 +763,7 @@ p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: mark-end */ { -#line 340 "src/libfsm/parser.act" +#line 339 "src/libfsm/parser.act" fsm_setend(fsm, (ZIs), 1); @@ -772,7 +772,7 @@ 
p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z /* END OF ACTION: mark-end */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((ZIn)); @@ -810,7 +810,7 @@ p_fsm(fsm fsm, lex_state lex_state, act_state act_state) ADVANCE_LEXER; /* BEGINNING OF ACTION: free-statelist */ { -#line 366 "src/libfsm/parser.act" +#line 353 "src/libfsm/parser.act" struct act_statelist *p; struct act_statelist *next; @@ -834,7 +834,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-syntax */ { -#line 413 "src/libfsm/parser.act" +#line 410 "src/libfsm/parser.act" err(lex_state, "Syntax error"); exit(EXIT_FAILURE); @@ -865,7 +865,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-comma */ { -#line 400 "src/libfsm/parser.act" +#line 398 "src/libfsm/parser.act" err_expected(lex_state, "','"); @@ -895,7 +895,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-sep */ { -#line 392 "src/libfsm/parser.act" +#line 390 "src/libfsm/parser.act" err_expected(lex_state, "';'"); @@ -923,7 +923,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT */ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIb = xstrdup(lex_state->buf.a); @@ -941,7 +941,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF INLINE: ident */ /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((*ZIa)); @@ -999,7 +999,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((ZIb)); @@ -1057,7 +1057,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END 
OF ACTION: add-state */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((*ZIa)); @@ -1066,7 +1066,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF ACTION: free */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((ZIb)); @@ -1081,7 +1081,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) ADVANCE_LEXER; /* BEGINNING OF ACTION: add-edge-any */ { -#line 376 "src/libfsm/parser.act" +#line 375 "src/libfsm/parser.act" if (!fsm_addedge_any(fsm, (ZIx), (ZIy))) { perror("fsm_addedge_any"); @@ -1104,7 +1104,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) } /* BEGINNING OF ACTION: add-edge-literal */ { -#line 369 "src/libfsm/parser.act" +#line 368 "src/libfsm/parser.act" if (!fsm_addedge_literal(fsm, (ZIx), (ZIy), (ZIc))) { perror("fsm_addedge_literal"); @@ -1120,7 +1120,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) { /* BEGINNING OF ACTION: add-edge-epsilon */ { -#line 383 "src/libfsm/parser.act" +#line 382 "src/libfsm/parser.act" if (!fsm_addedge_epsilon(fsm, (ZIx), (ZIy))) { perror("fsm_addedge_epsilon"); @@ -1138,7 +1138,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) { /* BEGINNING OF ACTION: err-expected-trans */ { -#line 396 "src/libfsm/parser.act" +#line 394 "src/libfsm/parser.act" err_expected(lex_state, "transition"); @@ -1162,7 +1162,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((*ZIa)); @@ -1220,7 +1220,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" 
free((*ZIa)); @@ -1269,7 +1269,7 @@ ZL2_xend_C_Cend_Hids:; } /* END OF INLINE: xend::end-ids */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): RESTORE_LEXER; goto ZL1; @@ -1298,7 +1298,7 @@ p_xend_C_Cend_Hid(fsm fsm, lex_state lex_state, act_state act_state, state ZIs) case (TOK_ENDID): /* BEGINNING OF EXTRACT: ENDID */ { -#line 277 "src/libfsm/parser.act" +#line 255 "src/libfsm/parser.act" unsigned long u; char *e; @@ -1333,7 +1333,7 @@ p_xend_C_Cend_Hid(fsm fsm, lex_state lex_state, act_state act_state, state ZIs) ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-end-id */ { -#line 344 "src/libfsm/parser.act" +#line 343 "src/libfsm/parser.act" if (!fsm_endid_set(fsm, (ZIs), (ZIid))) { goto ZL1; @@ -1351,7 +1351,7 @@ ZL1:; /* BEGINNING OF TRAILER */ -#line 479 "src/libfsm/parser.act" +#line 415 "src/libfsm/parser.act" struct fsm *fsm_parse(FILE *f, const struct fsm_alloc *alloc) { diff --git a/src/libfsm/parser.h b/src/libfsm/parser.h index edeebb112..32f562c66 100644 --- a/src/libfsm/parser.h +++ b/src/libfsm/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 163 "src/libfsm/parser.act" +#line 153 "src/libfsm/parser.act" typedef struct lex_state * lex_state; @@ -27,7 +27,7 @@ extern void p_fsm(fsm, lex_state, act_state); /* BEGINNING OF TRAILER */ -#line 480 "src/libfsm/parser.act" +#line 479 "src/libfsm/parser.act" #line 33 "src/libfsm/parser.h" diff --git a/src/libre/dialect/glob/lexer.c b/src/libre/dialect/glob/lexer.c index 843cedc4e..2a4d33a29 100644 --- a/src/libre/dialect/glob/lexer.c +++ b/src/libre/dialect/glob/lexer.c @@ -10,11 +10,31 @@ static enum lx_glob_token z0(struct lx_glob_lx *lx); +static int +lx_glob_advance_end(struct lx_glob_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of 
lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_glob_lx *lx) +lx_glob_getc(struct lx_glob_lx *lx) { int c; @@ -30,18 +50,19 @@ lx_getc(struct lx_glob_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_glob_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_glob_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_glob_getc((struct lx_glob_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -50,10 +71,7 @@ lx_glob_ungetc(struct lx_glob_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -105,6 +123,17 @@ lx_glob_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_glob_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_glob_dynclear(void *buf_opaque) { @@ -144,29 +173,28 @@ lx_glob_dynfree(void *buf_opaque) static enum lx_glob_token z0(struct lx_glob_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '*': state = S2; break; case '?': state = S3; break; @@ -175,34 +203,41 @@ z0(struct lx_glob_lx *lx) break; case S1: /* e.g. 
"\\x00" */ - lx_glob_ungetc(lx, c); return TOK_CHAR; + lx_glob_ungetc(lx, c); lx_glob_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "*" */ - lx_glob_ungetc(lx, c); return TOK_MANY; + lx_glob_ungetc(lx, c); lx_glob_dynpop(lx->buf_opaque); return TOK_MANY; - case S3: /* e.g. "?" */ - lx_glob_ungetc(lx, c); return TOK_ANY; + case S3: /* e.g. "\077" */ + lx_glob_ungetc(lx, c); lx_glob_dynpop(lx->buf_opaque); return TOK_ANY; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_MANY; + case S3: return TOK_ANY; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_glob_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_MANY; - case S3: return TOK_ANY; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -254,6 +289,7 @@ lx_glob_init(struct lx_glob_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_glob_dynpop; } enum lx_glob_token diff --git a/src/libre/dialect/glob/parser.c b/src/libre/dialect/glob/parser.c index c8f021380..b20798f4f 100644 --- a/src/libre/dialect/glob/parser.c +++ b/src/libre/dialect/glob/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -304,7 +304,7 @@ ZL2_list_Hof_Hatoms:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -322,7 +322,7 @@ ZL2_list_Hof_Hatoms:; goto ZL2_list_Hof_Hatoms; /* END OF INLINE: list-of-atoms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -348,7 +348,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state 
lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? &class_any : &class_notnl; @@ -358,7 +358,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -378,7 +378,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -397,7 +397,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIe) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZIe) == NULL) { @@ -418,7 +418,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -428,7 +428,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIg) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIg) == NULL) { @@ -440,7 +440,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -449,7 +449,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: count-zero-or-more */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); @@ -478,7 +478,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -490,7 +490,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -523,7 +523,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -537,7 +537,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZIe) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -558,7 
+558,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -575,7 +575,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* END OF INLINE: 119 */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -601,7 +601,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -626,7 +626,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/glob/parser.h b/src/libre/dialect/glob/parser.h index ec618caca..89800e6e9 100644 --- a/src/libre/dialect/glob/parser.h +++ b/src/libre/dialect/glob/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__glob(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/glob/parser.h" diff --git a/src/libre/dialect/like/lexer.c b/src/libre/dialect/like/lexer.c index 4f4dcbdab..2edc365a9 100644 --- a/src/libre/dialect/like/lexer.c +++ b/src/libre/dialect/like/lexer.c @@ -10,11 +10,31 @@ static enum lx_like_token z0(struct lx_like_lx *lx); +static int +lx_like_advance_end(struct lx_like_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + 
lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_like_lx *lx) +lx_like_getc(struct lx_like_lx *lx) { int c; @@ -30,18 +50,19 @@ lx_getc(struct lx_like_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_like_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_like_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_like_getc((struct lx_like_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -50,10 +71,7 @@ lx_like_ungetc(struct lx_like_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -105,6 +123,17 @@ lx_like_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_like_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_like_dynclear(void *buf_opaque) { @@ -144,29 +173,28 @@ lx_like_dynfree(void *buf_opaque) static enum lx_like_token z0(struct lx_like_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '%': state = S2; break; case '_': state = S3; break; @@ -175,34 +203,41 @@ z0(struct lx_like_lx *lx) break; case S1: /* e.g. "\\x00" */ - lx_like_ungetc(lx, c); return TOK_CHAR; + lx_like_ungetc(lx, c); lx_like_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "%" */ - lx_like_ungetc(lx, c); return TOK_MANY; + lx_like_ungetc(lx, c); lx_like_dynpop(lx->buf_opaque); return TOK_MANY; case S3: /* e.g. "_" */ - lx_like_ungetc(lx, c); return TOK_ANY; + lx_like_ungetc(lx, c); lx_like_dynpop(lx->buf_opaque); return TOK_ANY; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_MANY; + case S3: return TOK_ANY; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_like_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_MANY; - case S3: return TOK_ANY; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -254,6 +289,7 @@ lx_like_init(struct lx_like_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_like_dynpop; } enum lx_like_token diff --git a/src/libre/dialect/like/parser.c b/src/libre/dialect/like/parser.c index 64bfe1078..008bd2b04 100644 --- a/src/libre/dialect/like/parser.c +++ b/src/libre/dialect/like/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -304,7 +304,7 @@ ZL2_list_Hof_Hatoms:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -322,7 +322,7 @@ ZL2_list_Hof_Hatoms:; goto ZL2_list_Hof_Hatoms; /* END OF INLINE: 
list-of-atoms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -348,7 +348,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? &class_any : &class_notnl; @@ -358,7 +358,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -378,7 +378,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -397,7 +397,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIe) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZIe) == NULL) { @@ -418,7 +418,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -428,7 +428,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIg) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIg) == NULL) { @@ -440,7 +440,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -449,7 +449,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: count-zero-or-more */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); @@ -478,7 +478,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -490,7 +490,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -523,7 +523,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -537,7 +537,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZIe) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -558,7 
+558,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -575,7 +575,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* END OF INLINE: 119 */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -601,7 +601,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -626,7 +626,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/like/parser.h b/src/libre/dialect/like/parser.h index f6c87ad7b..5294f9792 100644 --- a/src/libre/dialect/like/parser.h +++ b/src/libre/dialect/like/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__like(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/like/parser.h" diff --git a/src/libre/dialect/literal/lexer.c b/src/libre/dialect/literal/lexer.c index f4ff77a37..f13bdbc6f 100644 --- a/src/libre/dialect/literal/lexer.c +++ b/src/libre/dialect/literal/lexer.c @@ -10,11 +10,31 @@ static enum lx_literal_token z0(struct lx_literal_lx *lx); +static int +lx_literal_advance_end(struct lx_literal_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = 
lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_literal_lx *lx) +lx_literal_getc(struct lx_literal_lx *lx) { int c; @@ -30,18 +50,19 @@ lx_getc(struct lx_literal_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_literal_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_literal_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_literal_getc((struct lx_literal_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -50,10 +71,7 @@ lx_literal_ungetc(struct lx_literal_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -105,6 +123,17 @@ lx_literal_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_literal_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_literal_dynclear(void *buf_opaque) { @@ -144,52 +173,58 @@ lx_literal_dynfree(void *buf_opaque) static enum lx_literal_token z0(struct lx_literal_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ state = S1; break; case S1: /* e.g. 
"" */ - lx_literal_ungetc(lx, c); return TOK_CHAR; + lx_literal_ungetc(lx, c); lx_literal_dynpop(lx->buf_opaque); return TOK_CHAR; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_literal_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -237,6 +272,7 @@ lx_literal_init(struct lx_literal_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_literal_dynpop; } enum lx_literal_token diff --git a/src/libre/dialect/literal/parser.c b/src/libre/dialect/literal/parser.c index 44547716b..5d1dc82f7 100644 --- a/src/libre/dialect/literal/parser.c +++ b/src/libre/dialect/literal/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -304,7 +304,7 @@ ZL2_list_Hof_Hatoms:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -322,7 +322,7 @@ ZL2_list_Hof_Hatoms:; goto ZL2_list_Hof_Hatoms; /* END OF INLINE: list-of-atoms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -349,7 +349,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -363,7 +363,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" 
(ZIe) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -384,7 +384,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -401,7 +401,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ /* END OF INLINE: 117 */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -427,7 +427,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -467,7 +467,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, case (TOK_CHAR): /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -490,7 +490,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -506,7 +506,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -518,7 +518,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, 
*flags); if ((ZInode) == NULL) { @@ -539,7 +539,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/literal/parser.h b/src/libre/dialect/literal/parser.h index be58db4ea..7f90a15ef 100644 --- a/src/libre/dialect/literal/parser.h +++ b/src/libre/dialect/literal/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__literal(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/literal/parser.h" diff --git a/src/libre/dialect/native/lexer.c b/src/libre/dialect/native/lexer.c index 2399683ac..b18634004 100644 --- a/src/libre/dialect/native/lexer.c +++ b/src/libre/dialect/native/lexer.c @@ -12,11 +12,31 @@ static enum lx_native_token z0(struct lx_native_lx *lx); static enum lx_native_token z1(struct lx_native_lx *lx); static enum lx_native_token z2(struct lx_native_lx *lx); +static int +lx_native_advance_end(struct lx_native_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. 
*/ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_native_lx *lx) +lx_native_getc(struct lx_native_lx *lx) { int c; @@ -32,18 +52,19 @@ lx_getc(struct lx_native_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_native_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_native_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_native_getc((struct lx_native_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -52,10 +73,7 @@ lx_native_ungetc(struct lx_native_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -107,6 +125,17 @@ lx_native_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_native_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_native_dynclear(void *buf_opaque) { @@ -146,29 +175,28 @@ lx_native_dynfree(void *buf_opaque) static enum lx_native_token z0(struct lx_native_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '0': case '1': @@ -182,7 +210,9 @@ z0(struct lx_native_lx *lx) case '9': state = S1; break; case ',': state = S2; break; case '}': state = S3; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -198,69 +228,75 @@ z0(struct lx_native_lx *lx) case '7': case '8': case '9': break; - default: lx_native_ungetc(lx, c); return TOK_COUNT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_COUNT; } break; case S2: /* e.g. "," */ - lx_native_ungetc(lx, c); return TOK_SEP; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_SEP; case S3: /* e.g. "}" */ - lx_native_ungetc(lx, c); return lx->z = z2, TOK_CLOSECOUNT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSECOUNT; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_COUNT; + case S2: return TOK_SEP; + case S3: return lx->z = z2, TOK_CLOSECOUNT; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_COUNT; - case S2: return TOK_SEP; - case S3: return TOK_CLOSECOUNT; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_native_token z1(struct lx_native_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, - S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, - S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, - S50, S51, S52, S53, S54, S55, 
S56, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, + S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, + S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, + S50, S51, S52, S53, S54, S55, S56 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '[': state = S1; break; case '\\': state = S3; break; @@ -273,12 +309,12 @@ z1(struct lx_native_lx *lx) case S1: /* e.g. "[" */ switch ((unsigned char) c) { case ':': state = S12; break; - default: lx_native_ungetc(lx, c); return TOK_CHAR; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; } break; case S2: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_CHAR; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\\" */ switch ((unsigned char) c) { @@ -301,25 +337,27 @@ z1(struct lx_native_lx *lx) case '5': case '6': case '7': state = S9; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S4: /* e.g. "-" */ switch ((unsigned char) c) { case ']': state = S6; break; - default: lx_native_ungetc(lx, c); return TOK_RANGE; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_RANGE; } break; case S5: /* e.g. "]" */ - lx_native_ungetc(lx, c); return lx->z = z2, TOK_CLOSEGROUP; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSEGROUP; case S6: /* e.g. 
"-]" */ - lx_native_ungetc(lx, c); return lx->z = z2, TOK_CLOSEGROUPRANGE; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSEGROUPRANGE; case S7: /* e.g. "\\-" */ - lx_native_ungetc(lx, c); return TOK_ESC; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ESC; case S8: /* e.g. "\\x" */ switch ((unsigned char) c) { @@ -345,7 +383,9 @@ z1(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S10; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -359,7 +399,7 @@ z1(struct lx_native_lx *lx) case '5': case '6': case '7': break; - default: lx_native_ungetc(lx, c); return TOK_OCT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; } break; @@ -387,12 +427,14 @@ z1(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S11; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S11: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_HEX; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_HEX; case S12: /* e.g. "[:" */ switch ((unsigned char) c) { @@ -408,35 +450,45 @@ z1(struct lx_native_lx *lx) case 'w': state = S21; break; case 'u': state = S22; break; case 'p': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S13: /* e.g. "[:d" */ switch ((unsigned char) c) { case 'i': state = S55; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S14: /* e.g. 
"[:s" */ switch ((unsigned char) c) { case 'p': state = S52; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S15: /* e.g. "[:h" */ switch ((unsigned char) c) { case 's': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S16: /* e.g. "[:g" */ switch ((unsigned char) c) { case 'r': state = S49; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -444,42 +496,54 @@ z1(struct lx_native_lx *lx) switch ((unsigned char) c) { case 's': state = S41; break; case 'l': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S18: /* e.g. "[:c" */ switch ((unsigned char) c) { case 'n': state = S38; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S19: /* e.g. "[:l" */ switch ((unsigned char) c) { case 'o': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S20: /* e.g. "[:x" */ switch ((unsigned char) c) { case 'd': state = S13; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S21: /* e.g. 
"[:w" */ switch ((unsigned char) c) { case 'o': state = S35; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S22: /* e.g. "[:u" */ switch ((unsigned char) c) { case 'p': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -487,129 +551,165 @@ z1(struct lx_native_lx *lx) switch ((unsigned char) c) { case 'r': state = S24; break; case 'u': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S24: /* e.g. "[:pr" */ switch ((unsigned char) c) { case 'i': state = S31; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S25: /* e.g. "[:pu" */ switch ((unsigned char) c) { case 'n': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S26: /* e.g. "[:pun" */ switch ((unsigned char) c) { case 'c': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S27: /* e.g. "[:digi" */ switch ((unsigned char) c) { case 't': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S28: /* e.g. 
"[:word" */ switch ((unsigned char) c) { case ':': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S29: /* e.g. "[:word:" */ switch ((unsigned char) c) { case ']': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S30: /* e.g. "[:word:]" */ - lx_native_ungetc(lx, c); return TOK_NAMED__CLASS; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; case S31: /* e.g. "[:pri" */ switch ((unsigned char) c) { case 'n': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S32: /* e.g. "[:up" */ switch ((unsigned char) c) { case 'p': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S33: /* e.g. "[:low" */ switch ((unsigned char) c) { case 'e': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S34: /* e.g. "[:lowe" */ switch ((unsigned char) c) { case 'r': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S35: /* e.g. 
"[:wo" */ switch ((unsigned char) c) { case 'r': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S36: /* e.g. "[:wor" */ switch ((unsigned char) c) { case 'd': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S37: /* e.g. "[:lo" */ switch ((unsigned char) c) { case 'w': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S38: /* e.g. "[:cn" */ switch ((unsigned char) c) { case 't': state = S39; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S39: /* e.g. "[:cnt" */ switch ((unsigned char) c) { case 'r': state = S40; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S40: /* e.g. "[:cntr" */ switch ((unsigned char) c) { case 'l': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S41: /* e.g. 
"[:as" */ switch ((unsigned char) c) { case 'c': state = S47; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -617,164 +717,200 @@ z1(struct lx_native_lx *lx) switch ((unsigned char) c) { case 'p': state = S43; break; case 'n': state = S44; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S43: /* e.g. "[:alp" */ switch ((unsigned char) c) { case 'h': state = S46; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S44: /* e.g. "[:aln" */ switch ((unsigned char) c) { case 'u': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S45: /* e.g. "[:alnu" */ switch ((unsigned char) c) { case 'm': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S46: /* e.g. "[:alph" */ switch ((unsigned char) c) { case 'a': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S47: /* e.g. "[:asc" */ switch ((unsigned char) c) { case 'i': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S48: /* e.g. 
"[:asci" */ switch ((unsigned char) c) { case 'i': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S49: /* e.g. "[:gr" */ switch ((unsigned char) c) { case 'a': state = S50; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S50: /* e.g. "[:gra" */ switch ((unsigned char) c) { case 'p': state = S51; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S51: /* e.g. "[:grap" */ switch ((unsigned char) c) { case 'h': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S52: /* e.g. "[:sp" */ switch ((unsigned char) c) { case 'a': state = S53; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S53: /* e.g. "[:spa" */ switch ((unsigned char) c) { case 'c': state = S54; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S54: /* e.g. "[:spac" */ switch ((unsigned char) c) { case 'e': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S55: /* e.g. 
"[:di" */ switch ((unsigned char) c) { case 'g': state = S56; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S56: /* e.g. "[:dig" */ switch ((unsigned char) c) { case 'i': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; case S4: return TOK_RANGE; - case S5: return TOK_CLOSEGROUP; - case S6: return TOK_CLOSEGROUPRANGE; + case S5: return lx->z = z2, TOK_CLOSEGROUP; + case S6: return lx->z = z2, TOK_CLOSEGROUPRANGE; case S7: return TOK_ESC; case S9: return TOK_OCT; case S11: return TOK_HEX; case S30: return TOK_NAMED__CLASS; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_native_token z2(struct lx_native_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, 
S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; case '{': state = S3; break; @@ -820,63 +956,63 @@ z2(struct lx_native_lx *lx) case 'v': case '{': case '|': state = S19; break; - default: lx_native_ungetc(lx, c); return TOK_CHAR; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; } break; case S2: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_CHAR; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "{" */ - lx_native_ungetc(lx, c); return lx->z = z0, TOK_OPENCOUNT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z0, TOK_OPENCOUNT; case S4: /* e.g. "[" */ switch ((unsigned char) c) { case '^': state = S14; break; case ']': state = S15; break; - default: lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUP; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUP; } break; case S5: /* e.g. "|" */ - lx_native_ungetc(lx, c); return TOK_ALT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ALT; case S6: /* e.g. "." */ - lx_native_ungetc(lx, c); return TOK_ANY; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ANY; case S7: /* e.g. "+" */ - lx_native_ungetc(lx, c); return TOK_PLUS; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_PLUS; case S8: /* e.g. "*" */ - lx_native_ungetc(lx, c); return TOK_STAR; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_STAR; - case S9: /* e.g. "?" */ - lx_native_ungetc(lx, c); return TOK_OPT; + case S9: /* e.g. "\077" */ + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OPT; case S10: /* e.g. 
"$" */ - lx_native_ungetc(lx, c); return TOK_END; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_END; case S11: /* e.g. "^" */ - lx_native_ungetc(lx, c); return TOK_START; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_START; case S12: /* e.g. ")" */ - lx_native_ungetc(lx, c); return TOK_CLOSESUB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CLOSESUB; case S13: /* e.g. "(" */ - lx_native_ungetc(lx, c); return TOK_OPENSUB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OPENSUB; case S14: /* e.g. "[^" */ switch ((unsigned char) c) { case ']': state = S16; break; - default: lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUPINV; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUPINV; } break; case S15: /* e.g. "[]" */ - lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUPCB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUPCB; case S16: /* e.g. "[^]" */ - lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUPINVCB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUPINVCB; case S17: /* e.g. "\\x" */ switch ((unsigned char) c) { @@ -902,7 +1038,9 @@ z2(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -916,12 +1054,12 @@ z2(struct lx_native_lx *lx) case '5': case '6': case '7': state = S20; break; - default: lx_native_ungetc(lx, c); return TOK_OCT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; } break; case S19: /* e.g. "\\$" */ - lx_native_ungetc(lx, c); return TOK_ESC; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ESC; case S20: /* e.g. 
"\\00" */ switch ((unsigned char) c) { @@ -933,12 +1071,12 @@ z2(struct lx_native_lx *lx) case '5': case '6': case '7': state = S21; break; - default: lx_native_ungetc(lx, c); return TOK_OCT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; } break; case S21: /* e.g. "\\000" */ - lx_native_ungetc(lx, c); return TOK_OCT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; case S22: /* e.g. "\\x0" */ switch ((unsigned char) c) { @@ -964,32 +1102,26 @@ z2(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S23: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_HEX; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_HEX; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; - case S3: return TOK_OPENCOUNT; - case S4: return TOK_OPENGROUP; + case S3: return lx->z = z0, TOK_OPENCOUNT; + case S4: return lx->z = z1, TOK_OPENGROUP; case S5: return TOK_ALT; case S6: return TOK_ANY; case S7: return TOK_PLUS; @@ -999,16 +1131,31 @@ z2(struct lx_native_lx *lx) case S11: return TOK_START; case S12: return TOK_CLOSESUB; case S13: return TOK_OPENSUB; - case S14: return TOK_OPENGROUPINV; - case S15: return TOK_OPENGROUPCB; - case S16: return TOK_OPENGROUPINVCB; + case S14: return lx->z = z1, TOK_OPENGROUPINV; + case S15: return lx->z = z1, TOK_OPENGROUPCB; + case S16: return lx->z = z1, TOK_OPENGROUPINVCB; case S18: return TOK_OCT; case S19: return TOK_ESC; case S20: return TOK_OCT; case S21: return TOK_OCT; case S23: return TOK_HEX; - default: errno = EINVAL; return 
TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -1164,6 +1311,7 @@ lx_native_init(struct lx_native_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_native_dynpop; } enum lx_native_token diff --git a/src/libre/dialect/native/parser.c b/src/libre/dialect/native/parser.c index 809383bf8..63c5f8cb8 100644 --- a/src/libre/dialect/native/parser.c +++ b/src/libre/dialect/native/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -326,7 +326,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -349,7 +349,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -385,7 +385,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -443,7 +443,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -499,7 +499,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags /* END OF INLINE: 141 */ /* BEGINNING OF 
ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -531,7 +531,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI216 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -545,7 +545,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI263)); mark(&act_state->countend, &(ZIend)); @@ -555,7 +555,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((*ZIm) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -586,7 +586,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -618,7 +618,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_CLOSECOUNT): /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI219 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -636,7 +636,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI263)); mark(&act_state->countend, &(ZIend)); @@ -646,7 +646,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF 
ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -696,7 +696,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIclass), (ZInode))) { goto ZL1; @@ -709,7 +709,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; goto ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms; /* END OF INLINE: expr::character-class::list-of-class-terms */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -736,7 +736,7 @@ p_154(flags flags, lex_state lex_state, act_state act_state, err err) case (TOK_RANGE): /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI155 = '-'; ZI156 = lex_state->lx.start; @@ -760,7 +760,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-range */ { -#line 722 "src/libre/parser.act" +#line 718 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXRANGE; @@ -795,7 +795,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -815,7 +815,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; goto ZL2_expr_C_Clist_Hof_Hpieces; /* END OF INLINE: expr::list-of-pieces */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -849,7 +849,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -875,7 +875,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" 
+#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -914,7 +914,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -975,7 +975,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1031,7 +1031,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* END OF INLINE: 109 */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1064,7 +1064,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1096,7 +1096,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1141,7 +1141,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1208,7 +1208,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZI243 = DIALECT_CLASS(lex_state->buf.a); if (ZI243 == NULL) { @@ -1241,7 +1241,7 @@ 
p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1325,7 +1325,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -1349,7 +1349,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_NAMED; (ZIr).u.named.class = (ZIid); @@ -1389,7 +1389,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUP */ { -#line 319 "src/libre/parser.act" +#line 318 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI167 = lex_state->lx.end; @@ -1403,7 +1403,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1430,7 +1430,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPCB */ { -#line 335 "src/libre/parser.act" +#line 334 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI186 = lex_state->lx.end; @@ -1444,7 +1444,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1457,7 +1457,7 @@ 
p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: make-literal-cbrak */ { -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -1471,7 +1471,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -1493,7 +1493,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINV */ { -#line 327 "src/libre/parser.act" +#line 326 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI178 = lex_state->lx.end; @@ -1507,7 +1507,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1520,7 +1520,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1577,7 +1577,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINVCB */ { -#line 343 "src/libre/parser.act" +#line 342 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI193 = lex_state->lx.end; @@ -1591,7 +1591,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1604,7 +1604,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, 
act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1647,7 +1647,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-make-invert */ /* BEGINNING OF ACTION: make-literal-cbrak */ { -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -1661,7 +1661,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -1693,7 +1693,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI200 = ']'; ZI201 = lex_state->lx.start; @@ -1709,7 +1709,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -1728,7 +1728,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUPRANGE */ { -#line 361 "src/libre/parser.act" +#line 360 "src/libre/parser.act" ZIcrange = '-'; ZI203 = lex_state->lx.start; @@ -1744,7 +1744,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIrange) = ast_make_expr_literal(act_state->poolp, *flags, (ZIcrange)); if ((ZIrange) == NULL) { @@ -1756,7 +1756,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: 
ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZIrange))) { goto ZL4; @@ -1767,7 +1767,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -1785,7 +1785,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -1821,7 +1821,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZIrstart = lex_state->lx.start; @@ -1842,7 +1842,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode1) == NULL) { @@ -1865,7 +1865,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (ZIc); @@ -1875,7 +1875,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI183 = '-'; ZI184 = lex_state->lx.start; @@ -1896,7 +1896,7 @@ p_180(flags 
flags, lex_state lex_state, act_state act_state, err err, t_ast__exp } /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -1935,7 +1935,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF INLINE: 182 */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZItmp), (ZInode1))) { goto ZL1; @@ -1977,7 +1977,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI149 = lex_state->lx.start; @@ -1993,7 +1993,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -2050,7 +2050,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, } /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -2088,7 +2088,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2109,7 +2109,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2129,7 
+2129,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -2141,7 +2141,7 @@ ZL1:; /* END OF ACTION: err-expected-alts */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2170,7 +2170,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (*ZIcbrak)); if ((ZInode1) == NULL) { @@ -2194,7 +2194,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (*ZIcbrak); @@ -2204,7 +2204,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI196 = '-'; ZI197 = lex_state->lx.start; @@ -2225,7 +2225,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs } /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZIcbrak); @@ -2235,7 +2235,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2294,7 +2294,7 
@@ p_re__native(flags flags, lex_state lex_state, act_state act_state, err err, t_a /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -2308,7 +2308,7 @@ p_re__native(flags flags, lex_state lex_state, act_state act_state, err err, t_a } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -2334,7 +2334,7 @@ p_re__native(flags flags, lex_state lex_state, act_state act_state, err err, t_a { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -2422,7 +2422,7 @@ ZL2_expr_C_Clist_Hof_Halts:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIalts), (ZIa))) { goto ZL1; @@ -2441,7 +2441,7 @@ ZL2_expr_C_Clist_Hof_Halts:; goto ZL2_expr_C_Clist_Hof_Halts; /* END OF INLINE: expr::list-of-alts */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -2453,7 +2453,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -2485,7 +2485,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: OPENCOUNT */ { -#line 371 "src/libre/parser.act" +#line 370 "src/libre/parser.act" ZI263 = lex_state->lx.start; ZI264 = lex_state->lx.end; @@ -2501,7 +2501,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -2541,7 +2541,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, 
act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-one */ { -#line 817 "src/libre/parser.act" +#line 816 "src/libre/parser.act" (ZIc) = ast_make_count(0, 1); @@ -2555,7 +2555,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-one-or-more */ { -#line 813 "src/libre/parser.act" +#line 812 "src/libre/parser.act" (ZIc) = ast_make_count(1, AST_COUNT_UNBOUNDED); @@ -2569,7 +2569,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -2582,7 +2582,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, { /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -2599,7 +2599,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-count */ { -#line 701 "src/libre/parser.act" +#line 697 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCOUNT; @@ -2611,7 +2611,7 @@ ZL1:; /* END OF ACTION: err-expected-count */ /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -2640,7 +2640,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -2650,7 +2650,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -2667,7 +2667,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-end */ { -#line 943 "src/libre/parser.act" +#line 942 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_END); if ((ZIe) == NULL) { @@ -2687,7 +2687,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -2701,7 +2701,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZIe) = ast_make_expr_group(act_state->poolp, *flags, (ZIg), (ZIid)); if ((ZIe) == NULL) { @@ -2725,7 +2725,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-start */ { -#line 936 "src/libre/parser.act" +#line 935 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_START); if ((ZIe) == NULL) { @@ -2765,7 +2765,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -2777,7 +2777,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = 
ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -2806,7 +2806,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla { /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (*ZI243)); if ((ZInode) == NULL) { @@ -2826,7 +2826,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_NAMED; (ZIlower).u.named.class = (*ZI243); @@ -2842,7 +2842,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla } /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI244)); mark(&act_state->rangeend, &(ZIend)); @@ -2852,7 +2852,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2908,7 +2908,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZInode) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2942,7 +2942,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (*ZI247)); if ((ZInode) == NULL) { @@ -2962,7 +2962,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* BEGINNING OF ACTION: 
ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZI247); @@ -2978,7 +2978,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI } /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI248)); mark(&act_state->rangeend, &(ZIend)); @@ -2988,7 +2988,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -3035,7 +3035,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/native/parser.h b/src/libre/dialect/native/parser.h index e19648892..5cf04f6c6 100644 --- a/src/libre/dialect/native/parser.h +++ b/src/libre/dialect/native/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__native(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/native/parser.h" diff --git a/src/libre/dialect/pcre/lexer.c b/src/libre/dialect/pcre/lexer.c index b26096785..da8f825ee 100644 --- a/src/libre/dialect/pcre/lexer.c +++ b/src/libre/dialect/pcre/lexer.c @@ -17,11 +17,31 @@ static enum lx_pcre_token z5(struct lx_pcre_lx *lx); static enum lx_pcre_token z6(struct lx_pcre_lx *lx); static enum lx_pcre_token z7(struct lx_pcre_lx *lx); +static int +lx_pcre_advance_end(struct lx_pcre_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col 
= 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_pcre_lx *lx) +lx_pcre_getc(struct lx_pcre_lx *lx) { int c; @@ -37,18 +57,19 @@ lx_getc(struct lx_pcre_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_pcre_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_pcre_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_pcre_getc((struct lx_pcre_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -57,10 +78,7 @@ lx_pcre_ungetc(struct lx_pcre_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -112,6 +130,17 @@ lx_pcre_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_pcre_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_pcre_dynclear(void *buf_opaque) { @@ -151,32 +180,33 @@ lx_pcre_dynfree(void *buf_opaque) static enum lx_pcre_token z0(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '\\': state = S1; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S2; break; } break; @@ -184,19 +214,30 @@ z0(struct lx_pcre_lx *lx) case S1: /* e.g. "\\" */ switch ((unsigned char) c) { case 'E': state = S3; break; - default: lx_pcre_ungetc(lx, c); return TOK_CHAR; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S2: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S2: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_CHAR; + case S3: return lx->z = z7, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S3: @@ -211,45 +252,40 @@ z0(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_CHAR; - case S3: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z1(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c 
!= EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '0': case '1': @@ -263,7 +299,9 @@ z1(struct lx_pcre_lx *lx) case '9': state = S1; break; case ',': state = S2; break; case '}': state = S3; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -279,67 +317,75 @@ z1(struct lx_pcre_lx *lx) case '7': case '8': case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_COUNT; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_COUNT; } break; case S2: /* e.g. "," */ - lx_pcre_ungetc(lx, c); return TOK_SEP; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_SEP; case S3: /* e.g. "}" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSECOUNT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSECOUNT; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_COUNT; + case S2: return TOK_SEP; + case S3: return lx->z = z7, TOK_CLOSECOUNT; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_COUNT; - case S2: return TOK_SEP; - case S3: return TOK_CLOSECOUNT; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z2(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state 
= S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S2; break; } break; @@ -347,19 +393,30 @@ z2(struct lx_pcre_lx *lx) case S1: /* e.g. "\\" */ switch ((unsigned char) c) { case 'E': state = S3; break; - default: lx_pcre_ungetc(lx, c); return TOK_CHAR; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S2: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S2: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z = z3, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_CHAR; + case S3: return lx->z = z3, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S3: @@ -374,24 +431,30 @@ z2(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_CHAR; - case S3: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z3(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; + assert(lx != NULL); + + if (lx->clear != NULL) { + lx->clear(lx->buf_opaque); + } + + lx->start = lx->end; + + void *getc_opaque 
= (void *)lx; enum { S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, @@ -400,32 +463,23 @@ z3(struct lx_pcre_lx *lx) S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63, S64, S65, S66, S67, S68, S69, - S70, S71, S72, NONE + S70, S71, S72 } state; - assert(lx != NULL); - - if (lx->clear != NULL) { - lx->clear(lx->buf_opaque); - } - - state = NONE; - - lx->start = lx->end; - - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; case '[': state = S2; break; case '-': state = S4; break; case ']': state = S5; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S3; break; } break; @@ -441,22 +495,8 @@ z3(struct lx_pcre_lx *lx) case 'h': case 's': case 'v': - case 'w': state = S24; break; - case 'Q': state = S55; break; - case 'E': state = S56; break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': state = S57; break; - case 'x': state = S58; break; - case '0': state = S59; break; - case 'o': state = S60; break; - case 'c': state = S61; break; + case 'w': state = S23; break; + case 'c': state = S55; break; case '$': case '(': case '*': @@ -475,384 +515,492 @@ z3(struct lx_pcre_lx *lx) case 'r': case 't': case '{': - case '|': state = S63; break; - default: state = S62; break; + case '|': state = S56; break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': state = S57; break; + case 'Q': state = S58; break; + case 'E': state = S59; break; + case 'o': state = S60; break; 
+ case 'x': state = S62; break; + case '0': state = S63; break; + default: state = S61; break; } break; case S2: /* e.g. "[" */ switch ((unsigned char) c) { case ':': state = S7; break; - default: lx_pcre_ungetc(lx, c); return TOK_CHAR; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S3: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S3: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S4: /* e.g. "-" */ switch ((unsigned char) c) { case ']': state = S6; break; - default: lx_pcre_ungetc(lx, c); return TOK_RANGE; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_RANGE; } break; case S5: /* e.g. "]" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSEGROUP; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSEGROUP; case S6: /* e.g. "-]" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSEGROUPRANGE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSEGROUPRANGE; case S7: /* e.g. "[:" */ switch ((unsigned char) c) { case 'd': state = S8; break; - case 'u': state = S9; break; - case 'w': state = S10; break; - case 'x': state = S11; break; - case 'b': state = S12; break; - case 'c': state = S13; break; - case 'l': state = S14; break; + case 'p': state = S9; break; + case 'x': state = S10; break; + case 'c': state = S11; break; + case 'l': state = S12; break; + case 'g': state = S13; break; + case 's': state = S14; break; case 'a': state = S15; break; - case 's': state = S16; break; - case 'p': state = S17; break; - case 'g': state = S18; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'b': state = S16; break; + case 'u': state = S17; break; + case 'w': state = S18; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S8: /* e.g. 
"[:d" */ switch ((unsigned char) c) { case 'i': state = S53; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S9: /* e.g. "[:u" */ + case S9: /* e.g. "[:p" */ switch ((unsigned char) c) { - case 'p': state = S52; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S48; break; + case 'u': state = S49; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S10: /* e.g. "[:w" */ + case S10: /* e.g. "[:x" */ switch ((unsigned char) c) { - case 'o': state = S50; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'd': state = S8; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S11: /* e.g. "[:x" */ + case S11: /* e.g. "[:c" */ switch ((unsigned char) c) { - case 'd': state = S8; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S45; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S12: /* e.g. "[:b" */ + case S12: /* e.g. "[:l" */ switch ((unsigned char) c) { - case 'l': state = S47; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'o': state = S44; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S13: /* e.g. "[:c" */ + case S13: /* e.g. "[:g" */ switch ((unsigned char) c) { - case 'n': state = S44; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S41; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S14: /* e.g. "[:l" */ + case S14: /* e.g. 
"[:s" */ switch ((unsigned char) c) { - case 'o': state = S41; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S38; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S15: /* e.g. "[:a" */ switch ((unsigned char) c) { - case 's': state = S33; break; - case 'l': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 's': state = S30; break; + case 'l': state = S31; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S16: /* e.g. "[:s" */ + case S16: /* e.g. "[:b" */ switch ((unsigned char) c) { - case 'p': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'l': state = S27; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S17: /* e.g. "[:p" */ + case S17: /* e.g. "[:u" */ switch ((unsigned char) c) { - case 'r': state = S25; break; - case 'u': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S24; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S18: /* e.g. "[:g" */ + case S18: /* e.g. "[:w" */ switch ((unsigned char) c) { - case 'r': state = S19; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'o': state = S19; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S19: /* e.g. "[:gr" */ + case S19: /* e.g. 
"[:wo" */ switch ((unsigned char) c) { - case 'a': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S20; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S20: /* e.g. "[:gra" */ + case S20: /* e.g. "[:wor" */ switch ((unsigned char) c) { - case 'p': state = S21; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'd': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S21: /* e.g. "[:grap" */ + case S21: /* e.g. "[:word" */ switch ((unsigned char) c) { - case 'h': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case ':': state = S22; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S22: /* e.g. "[:word" */ + case S22: /* e.g. "[:word:" */ switch ((unsigned char) c) { - case ':': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case ']': state = S23; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S23: /* e.g. "[:word:" */ + case S23: /* e.g. "\\D" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; + + case S24: /* e.g. "[:up" */ switch ((unsigned char) c) { - case ']': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S25; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S24: /* e.g. "\\d" */ - lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; - - case S25: /* e.g. "[:pr" */ + case S25: /* e.g. 
"[:low" */ switch ((unsigned char) c) { - case 'i': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'e': state = S26; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S26: /* e.g. "[:pu" */ + case S26: /* e.g. "[:lowe" */ switch ((unsigned char) c) { - case 'n': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S27: /* e.g. "[:pun" */ + case S27: /* e.g. "[:bl" */ switch ((unsigned char) c) { - case 'c': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S28; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S28: /* e.g. "[:digi" */ + case S28: /* e.g. "[:bla" */ switch ((unsigned char) c) { - case 't': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S29; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S29: /* e.g. "[:pri" */ + case S29: /* e.g. "[:blan" */ switch ((unsigned char) c) { - case 'n': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'k': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S30: /* e.g. "[:sp" */ + case S30: /* e.g. "[:as" */ switch ((unsigned char) c) { - case 'a': state = S31; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'c': state = S36; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S31: /* e.g. 
"[:spa" */ + case S31: /* e.g. "[:al" */ switch ((unsigned char) c) { - case 'c': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S32; break; + case 'n': state = S33; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S32: /* e.g. "[:spac" */ + case S32: /* e.g. "[:alp" */ switch ((unsigned char) c) { - case 'e': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'h': state = S35; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S33: /* e.g. "[:as" */ + case S33: /* e.g. "[:aln" */ switch ((unsigned char) c) { - case 'c': state = S39; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'u': state = S34; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S34: /* e.g. "[:al" */ + case S34: /* e.g. "[:alnu" */ switch ((unsigned char) c) { - case 'n': state = S35; break; - case 'p': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'm': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S35: /* e.g. "[:aln" */ + case S35: /* e.g. "[:alph" */ switch ((unsigned char) c) { - case 'u': state = S38; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S36: /* e.g. "[:alp" */ + case S36: /* e.g. 
"[:asc" */ switch ((unsigned char) c) { - case 'h': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S37; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S37: /* e.g. "[:alph" */ + case S37: /* e.g. "[:asci" */ switch ((unsigned char) c) { - case 'a': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S38: /* e.g. "[:alnu" */ + case S38: /* e.g. "[:sp" */ switch ((unsigned char) c) { - case 'm': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S39; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S39: /* e.g. "[:asc" */ + case S39: /* e.g. "[:spa" */ switch ((unsigned char) c) { - case 'i': state = S40; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'c': state = S40; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S40: /* e.g. "[:asci" */ + case S40: /* e.g. "[:spac" */ switch ((unsigned char) c) { - case 'i': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'e': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S41: /* e.g. "[:lo" */ + case S41: /* e.g. "[:gr" */ switch ((unsigned char) c) { - case 'w': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S42; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S42: /* e.g. 
"[:low" */ + case S42: /* e.g. "[:gra" */ switch ((unsigned char) c) { - case 'e': state = S43; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S43; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S43: /* e.g. "[:lowe" */ + case S43: /* e.g. "[:grap" */ switch ((unsigned char) c) { - case 'r': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'h': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S44: /* e.g. "[:cn" */ + case S44: /* e.g. "[:lo" */ switch ((unsigned char) c) { - case 't': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'w': state = S25; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S45: /* e.g. "[:cnt" */ + case S45: /* e.g. "[:cn" */ switch ((unsigned char) c) { - case 'r': state = S46; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 't': state = S46; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S46: /* e.g. "[:cntr" */ + case S46: /* e.g. "[:cnt" */ switch ((unsigned char) c) { - case 'l': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S47; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S47: /* e.g. "[:bl" */ + case S47: /* e.g. 
"[:cntr" */ switch ((unsigned char) c) { - case 'a': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'l': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S48: /* e.g. "[:bla" */ + case S48: /* e.g. "[:pr" */ switch ((unsigned char) c) { - case 'n': state = S49; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S52; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S49: /* e.g. "[:blan" */ + case S49: /* e.g. "[:pu" */ switch ((unsigned char) c) { - case 'k': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S50; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S50: /* e.g. "[:wo" */ + case S50: /* e.g. "[:pun" */ switch ((unsigned char) c) { - case 'r': state = S51; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'c': state = S51; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S51: /* e.g. "[:wor" */ + case S51: /* e.g. "[:digi" */ switch ((unsigned char) c) { - case 'd': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 't': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S52: /* e.g. "[:up" */ + case S52: /* e.g. "[:pri" */ switch ((unsigned char) c) { - case 'p': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S51; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S53: /* e.g. 
"[:di" */ switch ((unsigned char) c) { case 'g': state = S54; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S54: /* e.g. "[:dig" */ switch ((unsigned char) c) { - case 'i': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S51; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S55: /* e.g. "\\Q" */ - lx_pcre_ungetc(lx, c); return lx->z = z2, lx->z(lx); + case S55: /* e.g. "\\c" */ + state = S72; break; - case S56: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z(lx); + case S56: /* e.g. "\\$" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ESC; case S57: /* e.g. "\\1" */ switch ((unsigned char) c) { @@ -866,13 +1014,29 @@ z3(struct lx_pcre_lx *lx) case '7': case '8': case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; - case S58: /* e.g. "\\x" */ + case S58: /* e.g. "\\Q" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z2, lx->z(lx); + + case S59: /* e.g. "\\E" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z(lx); + + case S60: /* e.g. "\\o" */ switch ((unsigned char) c) { - case '{': state = S69; break; + case '{': state = S70; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + } + break; + + case S61: /* e.g. "\\\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + + case S62: /* e.g. 
"\\x" */ + switch ((unsigned char) c) { + case '{': state = S66; break; case '0': case '1': case '2': @@ -894,12 +1058,12 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S70; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + case 'f': state = S67; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S59: /* e.g. "\\0" */ + case S63: /* e.g. "\\0" */ switch ((unsigned char) c) { case '0': case '1': @@ -908,31 +1072,12 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S68; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; + case '7': state = S64; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S60: /* e.g. "\\o" */ - switch ((unsigned char) c) { - case '{': state = S65; break; - default: lx_pcre_ungetc(lx, c); return TOK_NOESC; - } - break; - - case S61: /* e.g. "\\c" */ - state = S64; break; - - case S62: /* e.g. "\\g" */ - lx_pcre_ungetc(lx, c); return TOK_NOESC; - - case S63: /* e.g. "\\a" */ - lx_pcre_ungetc(lx, c); return TOK_ESC; - - case S64: /* e.g. "\\ca" */ - lx_pcre_ungetc(lx, c); return TOK_CONTROL; - - case S65: /* e.g. "\\o{" */ + case S64: /* e.g. "\\00" */ switch ((unsigned char) c) { case '0': case '1': @@ -941,44 +1086,15 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S66; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case '7': state = S65; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S66: /* e.g. "\\o{0" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': break; - case '}': state = S67; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; - } - break; + case S65: /* e.g. "\\000" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; - case S67: /* e.g. 
"\\000" */ - lx_pcre_ungetc(lx, c); return TOK_OCT; - - case S68: /* e.g. "\\00" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': state = S67; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; - } - break; - - case S69: /* e.g. "\\x{" */ + case S66: /* e.g. "\\x{" */ switch ((unsigned char) c) { case '0': case '1': @@ -1001,12 +1117,14 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S72; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'f': state = S69; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S70: /* e.g. "\\xa" */ + case S67: /* e.g. "\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -1029,17 +1147,17 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S71; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + case 'f': state = S68; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S71: /* e.g. "\\xaa" */ - lx_pcre_ungetc(lx, c); return TOK_HEX; + case S68: /* e.g. "\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; - case S72: /* e.g. "\\x{a" */ + case S69: /* e.g. "\\x{0" */ switch ((unsigned char) c) { - case '}': state = S71; break; + case '}': state = S68; break; case '0': case '1': case '2': @@ -1062,17 +1180,84 @@ z3(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S70: /* e.g. 
"\\o{" */ + switch ((unsigned char) c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': state = S71; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; + case S71: /* e.g. "\\o{0" */ + switch ((unsigned char) c) { + case '}': state = S65; break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S72: /* e.g. "\\c\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CONTROL; + default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_INVALID; + case S2: return TOK_CHAR; + case S3: return TOK_CHAR; + case S4: return TOK_RANGE; + case S5: return lx->z = z7, TOK_CLOSEGROUP; + case S6: return lx->z = z7, TOK_CLOSEGROUPRANGE; + case S23: return TOK_NAMED__CLASS; + case S55: return TOK_NOESC; + case S56: return TOK_ESC; + case S57: return TOK_UNSUPPORTED; + case S58: return lx->z = z2, lx->z(lx); + case S59: return TOK_EOF; + case S60: return TOK_NOESC; + case S61: return TOK_NOESC; + case S62: return TOK_HEX; + case S63: return TOK_OCT; + case S64: return TOK_OCT; + case S65: return TOK_OCT; + case S67: return TOK_HEX; + case S68: return TOK_HEX; + case S72: return TOK_CONTROL; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { - case S55: - case S56: + case S58: + case S59: break; default: @@ -1084,64 +1269,41 @@ z3(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_INVALID; - case S2: return TOK_CHAR; - case S3: return TOK_CHAR; - case S4: return TOK_RANGE; - case S5: return TOK_CLOSEGROUP; - case S6: return 
TOK_CLOSEGROUPRANGE; - case S24: return TOK_NAMED__CLASS; - case S55: return TOK_EOF; - case S56: return TOK_EOF; - case S57: return TOK_UNSUPPORTED; - case S58: return TOK_HEX; - case S59: return TOK_OCT; - case S60: return TOK_NOESC; - case S61: return TOK_NOESC; - case S62: return TOK_NOESC; - case S63: return TOK_ESC; - case S64: return TOK_CONTROL; - case S67: return TOK_OCT; - case S68: return TOK_OCT; - case S70: return TOK_HEX; - case S71: return TOK_HEX; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z4(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '+': case 'R': state = S1; break; @@ -1184,12 +1346,14 @@ z4(struct lx_pcre_lx *lx) case '-': state = S8; break; case ')': state = S9; break; case ':': state = S10; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S1: /* e.g. "R" */ - lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + case S1: /* e.g. "+" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; case S2: /* e.g. 
"0" */ switch ((unsigned char) c) { @@ -1203,53 +1367,45 @@ z4(struct lx_pcre_lx *lx) case '7': case '8': case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S3: /* e.g. "n" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__IGNORE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__IGNORE; case S4: /* e.g. "x" */ switch ((unsigned char) c) { case 'x': state = S7; break; - default: lx_pcre_ungetc(lx, c); return TOK_FLAG__EXTENDED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__EXTENDED; } break; case S5: /* e.g. "s" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__SINGLE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__SINGLE; case S6: /* e.g. "i" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__INSENSITIVE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__INSENSITIVE; case S7: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__UNKNOWN; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__UNKNOWN; case S8: /* e.g. "-" */ - lx_pcre_ungetc(lx, c); return TOK_NEGATE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NEGATE; case S9: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSE; case S10: /* e.g. 
":" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_SUB; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_SUB; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_UNSUPPORTED; case S2: return TOK_UNSUPPORTED; case S3: return TOK_FLAG__IGNORE; @@ -1258,38 +1414,52 @@ z4(struct lx_pcre_lx *lx) case S6: return TOK_FLAG__INSENSITIVE; case S7: return TOK_FLAG__UNKNOWN; case S8: return TOK_NEGATE; - case S9: return TOK_CLOSE; - case S10: return TOK_SUB; - default: errno = EINVAL; return TOK_ERROR; + case S9: return lx->z = z7, TOK_CLOSE; + case S10: return lx->z = z7, TOK_SUB; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z5(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '(': state = S2; break; case ')': state = S3; break; @@ -1297,23 +1467,34 @@ z5(struct lx_pcre_lx *lx) } break; - case S1: /* e.g. "a" */ + case S1: /* e.g. 
"\\x00" */ switch ((unsigned char) c) { case '(': - case ')': lx_pcre_ungetc(lx, c); return lx->z(lx); + case ')': lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z(lx); default: break; } break; case S2: /* e.g. "(" */ - lx_pcre_ungetc(lx, c); return TOK_INVALID__COMMENT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_INVALID__COMMENT; case S3: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return TOK_INVALID__COMMENT; + case S3: return lx->z = z7, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S1: @@ -1329,24 +1510,30 @@ z5(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_INVALID__COMMENT; - case S3: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z6(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; + assert(lx != NULL); + + if (lx->clear != NULL) { + lx->clear(lx->buf_opaque); + } + + lx->start = lx->end; + + void *getc_opaque = (void *)lx; enum { S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, @@ -1355,26 +1542,15 @@ z6(struct lx_pcre_lx *lx) S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63, S64, S65, S66, S67, S68, S69, - S70, S71, S72, S73, S74, S75, S76, S77, S78, NONE + S70, S71, S72, S73, S74, S75, S76, S77, S78 } state; - assert(lx != NULL); - - if (lx->clear != NULL) { - lx->clear(lx->buf_opaque); - } - - state = NONE; - - lx->start = lx->end; - - 
while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case ':': state = S1; break; case 'L': state = S2; break; @@ -1390,13 +1566,17 @@ z6(struct lx_pcre_lx *lx) case 'n': state = S12; break; case 'F': state = S13; break; case ')': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S1: /* e.g. ":" */ switch ((unsigned char) c) { - case ')': lx->lgetc = NULL; return TOK_UNKNOWN; + case ')': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S78; break; } break; @@ -1404,7 +1584,9 @@ z6(struct lx_pcre_lx *lx) case S2: /* e.g. "L" */ switch ((unsigned char) c) { case 'F': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1412,35 +1594,45 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'l': state = S18; break; case 'o': state = S76; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S4: /* e.g. "M" */ switch ((unsigned char) c) { case 'A': state = S74; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S5: /* e.g. 
"T" */ switch ((unsigned char) c) { case 'H': state = S72; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S6: /* e.g. "S" */ switch ((unsigned char) c) { case 'K': state = S70; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S7: /* e.g. "P" */ switch ((unsigned char) c) { case 'R': state = S67; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1448,21 +1640,27 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'C': state = S60; break; case 'N': state = S61; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S9: /* e.g. "a" */ switch ((unsigned char) c) { case 't': state = S56; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S10: /* e.g. 
"N" */ switch ((unsigned char) c) { case 'O': state = S46; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1470,7 +1668,9 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'R': state = S41; break; case 'O': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1478,7 +1678,9 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'l': state = S18; break; case 'e': state = S19; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1486,31 +1688,35 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case ':': state = S1; break; case 'A': state = S15; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S14: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, lx->z(lx); case S15: /* e.g. "FA" */ switch ((unsigned char) c) { case 'I': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S16: /* e.g. "FAI" */ switch ((unsigned char) c) { case 'L': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S17: /* e.g. 
"FAIL" */ switch ((unsigned char) c) { case ':': state = S1; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; @@ -1518,84 +1724,108 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'a': case 'b': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S19: /* e.g. "ne" */ switch ((unsigned char) c) { case 'g': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S20: /* e.g. "neg" */ switch ((unsigned char) c) { case 'a': state = S21; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S21: /* e.g. "nega" */ switch ((unsigned char) c) { case 't': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S22: /* e.g. "negat" */ switch ((unsigned char) c) { case 'i': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S23: /* e.g. "negati" */ switch ((unsigned char) c) { case 'v': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S24: /* e.g. 
"negativ" */ switch ((unsigned char) c) { case 'e': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S25: /* e.g. "negative" */ switch ((unsigned char) c) { case '_': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S26: /* e.g. "negative_" */ switch ((unsigned char) c) { case 'l': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S27: /* e.g. "negative_l" */ switch ((unsigned char) c) { case 'o': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S28: /* e.g. "negative_lo" */ switch ((unsigned char) c) { case 'o': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S29: /* e.g. "negative_loo" */ switch ((unsigned char) c) { case 'k': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1603,338 +1833,428 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'b': state = S31; break; case 'a': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S31: /* e.g. 
"negative_lookb" */ switch ((unsigned char) c) { case 'e': state = S38; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S32: /* e.g. "negative_looka" */ switch ((unsigned char) c) { case 'h': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S33: /* e.g. "negative_lookah" */ switch ((unsigned char) c) { case 'e': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S34: /* e.g. "negative_lookahe" */ switch ((unsigned char) c) { case 'a': state = S35; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S35: /* e.g. "negative_lookahea" */ switch ((unsigned char) c) { case 'd': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S36: /* e.g. "nla" */ switch ((unsigned char) c) { case ':': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S37: /* e.g. "LF" */ - lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; case S38: /* e.g. 
"negative_lookbe" */ switch ((unsigned char) c) { case 'h': state = S39; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S39: /* e.g. "negative_lookbeh" */ switch ((unsigned char) c) { case 'i': state = S40; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S40: /* e.g. "negative_lookbehi" */ switch ((unsigned char) c) { case 'n': state = S35; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S41: /* e.g. "CR" */ switch ((unsigned char) c) { case 'L': state = S2; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S42: /* e.g. "CO" */ switch ((unsigned char) c) { case 'M': state = S43; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S43: /* e.g. "COM" */ switch ((unsigned char) c) { case 'M': state = S44; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S44: /* e.g. "COMM" */ switch ((unsigned char) c) { case 'I': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S45: /* e.g. 
"ACCEP" */ switch ((unsigned char) c) { case 'T': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S46: /* e.g. "NO" */ switch ((unsigned char) c) { case '_': state = S47; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S47: /* e.g. "NO_" */ switch ((unsigned char) c) { case 'S': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S48: /* e.g. "NO_S" */ switch ((unsigned char) c) { case 'T': state = S49; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S49: /* e.g. "NO_ST" */ switch ((unsigned char) c) { case 'A': state = S50; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S50: /* e.g. "NO_STA" */ switch ((unsigned char) c) { case 'R': state = S51; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S51: /* e.g. "NO_STAR" */ switch ((unsigned char) c) { case 'T': state = S52; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S52: /* e.g. 
"NO_START" */ switch ((unsigned char) c) { case '_': state = S53; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S53: /* e.g. "NO_START_" */ switch ((unsigned char) c) { case 'O': state = S54; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S54: /* e.g. "NO_START_O" */ switch ((unsigned char) c) { case 'P': state = S55; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S55: /* e.g. "NO_START_OP" */ switch ((unsigned char) c) { case 'T': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S56: /* e.g. "at" */ switch ((unsigned char) c) { case 'o': state = S57; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S57: /* e.g. "ato" */ switch ((unsigned char) c) { case 'm': state = S58; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S58: /* e.g. "atom" */ switch ((unsigned char) c) { case 'i': state = S59; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S59: /* e.g. 
"atomi" */ switch ((unsigned char) c) { case 'c': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S60: /* e.g. "AC" */ switch ((unsigned char) c) { case 'C': state = S65; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S61: /* e.g. "AN" */ switch ((unsigned char) c) { case 'Y': state = S62; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S62: /* e.g. "ANY" */ switch ((unsigned char) c) { case 'C': state = S63; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S63: /* e.g. "ANYC" */ switch ((unsigned char) c) { case 'R': state = S64; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S64: /* e.g. "ANYCR" */ switch ((unsigned char) c) { case 'L': state = S2; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S65: /* e.g. "ACC" */ switch ((unsigned char) c) { case 'E': state = S66; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S66: /* e.g. 
"ACCE" */ switch ((unsigned char) c) { case 'P': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S67: /* e.g. "PR" */ switch ((unsigned char) c) { case 'U': state = S68; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S68: /* e.g. "PRU" */ switch ((unsigned char) c) { case 'N': state = S69; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S69: /* e.g. "PRUN" */ switch ((unsigned char) c) { case 'E': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S70: /* e.g. "SK" */ switch ((unsigned char) c) { case 'I': state = S71; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S71: /* e.g. "SKI" */ switch ((unsigned char) c) { case 'P': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S72: /* e.g. "TH" */ switch ((unsigned char) c) { case 'E': state = S73; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S73: /* e.g. 
"THE" */ switch ((unsigned char) c) { case 'N': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S74: /* e.g. "MA" */ switch ((unsigned char) c) { case 'R': state = S75; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S75: /* e.g. "MAR" */ switch ((unsigned char) c) { case 'K': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S76: /* e.g. "po" */ switch ((unsigned char) c) { case 's': state = S77; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S77: /* e.g. "pos" */ switch ((unsigned char) c) { case 'i': state = S21; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S78: /* e.g. ":a" */ + case S78: /* e.g. 
":\\x00" */ switch ((unsigned char) c) { - case ')': lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + case ')': lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; default: break; } break; @@ -1942,6 +2262,21 @@ z6(struct lx_pcre_lx *lx) default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S13: return TOK_UNSUPPORTED; + case S14: return lx->z = z7, lx->z(lx); + case S17: return TOK_UNSUPPORTED; + case S37: return TOK_UNSUPPORTED; + case S41: return TOK_UNSUPPORTED; + case S62: return TOK_UNSUPPORTED; + case S78: return TOK_UNSUPPORTED; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S14: @@ -1956,53 +2291,44 @@ z6(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S13: return TOK_UNSUPPORTED; - case S14: return TOK_EOF; - case S17: return TOK_UNSUPPORTED; - case S37: return TOK_UNSUPPORTED; - case S41: return TOK_UNSUPPORTED; - case S62: return TOK_UNSUPPORTED; - case S78: return TOK_UNSUPPORTED; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z7(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, - S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, - S40, S41, S42, S43, S44, S45, S46, S47, S48, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, + S30, S31, 
S32, S33, S34, S35, S36, S37, S38, S39, + S40, S41, S42, S43, S44, S45, S46, S47, S48 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S2; break; case '\n': @@ -2023,13 +2349,15 @@ z7(struct lx_pcre_lx *lx) case '$': state = S14; break; case '^': state = S15; break; case ')': state = S16; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "\\" */ switch ((unsigned char) c) { @@ -2042,24 +2370,7 @@ z7(struct lx_pcre_lx *lx) case 'X': case 'b': case 'g': - case 'k': state = S20; break; - case 'Q': state = S28; break; - case 'o': state = S29; break; - case 'c': state = S30; break; - case 'x': state = S32; break; - case '0': state = S33; break; - case 'R': state = S34; break; - case 'D': - case 'H': - case 'N': - case 'S': - case 'V': - case 'W': - case 'd': - case 'h': - case 's': - case 'v': - case 'w': state = S35; break; + case 'k': state = S21; break; case '$': case '(': case ')': @@ -2077,8 +2388,9 @@ z7(struct lx_pcre_lx *lx) case 'r': case 't': case '{': - case '|': state = S36; break; - case 'E': state = S37; break; + case '|': state = S28; break; + case 'E': state = S29; break; + case 'z': state = S30; break; case '1': case '2': case '3': @@ -2087,29 +2399,45 @@ z7(struct lx_pcre_lx *lx) case '6': case '7': case '8': - case '9': state = S38; break; - case 'z': state = S39; break; - default: state = S31; break; + case '9': state = S31; break; + case 'Q': state = S32; break; + case 'c': state = S33; break; + case 'o': state = S34; break; + case 
'x': state = S36; break; + case 'D': + case 'H': + case 'N': + case 'S': + case 'V': + case 'W': + case 'd': + case 'h': + case 's': + case 'v': + case 'w': state = S37; break; + case 'R': state = S38; break; + case '0': state = S39; break; + default: state = S35; break; } break; case S3: /* e.g. "\\x0a" */ - lx_pcre_ungetc(lx, c); return TOK_NEWLINE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NEWLINE; case S4: /* e.g. "\\x09" */ - lx_pcre_ungetc(lx, c); return TOK_WHITESPACE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_WHITESPACE; case S5: /* e.g. "#" */ - lx_pcre_ungetc(lx, c); return TOK_MAYBE_COMMENT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_MAYBE_COMMENT; case S6: /* e.g. "{" */ - lx_pcre_ungetc(lx, c); return lx->z = z1, TOK_OPENCOUNT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENCOUNT; case S7: /* e.g. "[" */ switch ((unsigned char) c) { case '^': state = S25; break; case ']': state = S26; break; - default: lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUP; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUP; } break; @@ -2117,66 +2445,56 @@ z7(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case '?': state = S17; break; case '*': state = S18; break; - default: lx_pcre_ungetc(lx, c); return TOK_OPEN; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OPEN; } break; case S9: /* e.g. "|" */ - lx_pcre_ungetc(lx, c); return TOK_ALT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ALT; case S10: /* e.g. "." */ - lx_pcre_ungetc(lx, c); return TOK_ANY; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ANY; case S11: /* e.g. "+" */ - lx_pcre_ungetc(lx, c); return TOK_PLUS; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_PLUS; case S12: /* e.g. 
"*" */ - lx_pcre_ungetc(lx, c); return TOK_STAR; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_STAR; - case S13: /* e.g. "?" */ - lx_pcre_ungetc(lx, c); return TOK_OPT; + case S13: /* e.g. "\077" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OPT; case S14: /* e.g. "$" */ - lx_pcre_ungetc(lx, c); return TOK_END__NL; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_END__NL; case S15: /* e.g. "^" */ - lx_pcre_ungetc(lx, c); return TOK_START; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_START; case S16: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return TOK_CLOSE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CLOSE; - case S17: /* e.g. "(?" */ + case S17: /* e.g. "(\077" */ switch ((unsigned char) c) { case '#': state = S19; break; + case '<': state = S20; break; case '!': case '&': - case '=': state = S20; break; - case 'P': state = S21; break; - case '<': state = S22; break; - default: lx_pcre_ungetc(lx, c); return lx->z = z4, TOK_FLAGS; + case '=': state = S21; break; + case 'P': state = S22; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z4, TOK_FLAGS; } break; case S18: /* e.g. "(*" */ - lx_pcre_ungetc(lx, c); return lx->z = z6, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z6, lx->z(lx); - case S19: /* e.g. "(?#" */ - lx_pcre_ungetc(lx, c); return lx->z = z5, lx->z(lx); + case S19: /* e.g. "(\077#" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z5, lx->z(lx); - case S20: /* e.g. "\\b" */ - lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; - - case S21: /* e.g. "(?P" */ - switch ((unsigned char) c) { - case '>': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; - } - break; - - case S22: /* e.g. "(?<" */ + case S20: /* e.g. 
"(\077<" */ switch ((unsigned char) c) { case '!': - case '=': state = S20; break; + case '=': state = S21; break; case 'A': case 'B': case 'C': @@ -2230,11 +2548,25 @@ z7(struct lx_pcre_lx *lx) case 'x': case 'y': case 'z': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S21: /* e.g. "\\B" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; + + case S22: /* e.g. "(\077P" */ + switch ((unsigned char) c) { + case '>': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S23: /* e.g. "(?': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S24: /* e.g. "(?" */ - lx_pcre_ungetc(lx, c); return TOK_OPENCAPTURE; + case S24: /* e.g. "(\077" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OPENCAPTURE; case S25: /* e.g. "[^" */ switch ((unsigned char) c) { case ']': state = S27; break; - default: lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUPINV; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUPINV; } break; case S26: /* e.g. "[]" */ - lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUPCB; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUPCB; case S27: /* e.g. "[^]" */ - lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUPINVCB; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUPINVCB; - case S28: /* e.g. "\\Q" */ - lx_pcre_ungetc(lx, c); return lx->z = z0, lx->z(lx); + case S28: /* e.g. 
"\\$" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ESC; - case S29: /* e.g. "\\o" */ + case S29: /* e.g. "\\E" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z(lx); + + case S30: /* e.g. "\\z" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_END; + + case S31: /* e.g. "\\1" */ switch ((unsigned char) c) { - case '{': state = S47; break; - default: lx_pcre_ungetc(lx, c); return TOK_NOESC; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; - case S30: /* e.g. "\\c" */ - state = S46; break; + case S32: /* e.g. "\\Q" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z0, lx->z(lx); - case S31: /* e.g. "\\i" */ - lx_pcre_ungetc(lx, c); return TOK_NOESC; + case S33: /* e.g. "\\c" */ + state = S48; break; - case S32: /* e.g. "\\x" */ + case S34: /* e.g. "\\o" */ + switch ((unsigned char) c) { + case '{': state = S46; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + } + break; + + case S35: /* e.g. "\\\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + + case S36: /* e.g. "\\x" */ switch ((unsigned char) c) { case '{': state = S42; break; case '0': @@ -2361,37 +2720,17 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': state = S43; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S33: /* e.g. "\\0" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': state = S40; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; - } - break; - - case S34: /* e.g. "\\R" */ - lx_pcre_ungetc(lx, c); return TOK_EOL; - - case S35: /* e.g. 
"\\d" */ - lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; - - case S36: /* e.g. "\\a" */ - lx_pcre_ungetc(lx, c); return TOK_ESC; + case S37: /* e.g. "\\D" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; - case S37: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z(lx); + case S38: /* e.g. "\\R" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_EOL; - case S38: /* e.g. "\\1" */ + case S39: /* e.g. "\\0" */ switch ((unsigned char) c) { case '0': case '1': @@ -2400,16 +2739,11 @@ z7(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': - case '8': - case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + case '7': state = S40; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S39: /* e.g. "\\z" */ - lx_pcre_ungetc(lx, c); return TOK_END; - case S40: /* e.g. "\\00" */ switch ((unsigned char) c) { case '0': @@ -2420,12 +2754,12 @@ z7(struct lx_pcre_lx *lx) case '5': case '6': case '7': state = S41; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; case S41: /* e.g. "\\000" */ - lx_pcre_ungetc(lx, c); return TOK_OCT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; case S42: /* e.g. "\\x{" */ switch ((unsigned char) c) { @@ -2451,11 +2785,13 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S43: /* e.g. "\\xa" */ + case S43: /* e.g. 
"\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -2479,14 +2815,14 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': state = S44; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S44: /* e.g. "\\xaa" */ - lx_pcre_ungetc(lx, c); return TOK_HEX; + case S44: /* e.g. "\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; - case S45: /* e.g. "\\x{a" */ + case S45: /* e.g. "\\x{0" */ switch ((unsigned char) c) { case '}': state = S44; break; case '0': @@ -2511,14 +2847,13 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S46: /* e.g. "\\ca" */ - lx_pcre_ungetc(lx, c); return TOK_CONTROL; - - case S47: /* e.g. "\\o{" */ + case S46: /* e.g. "\\o{" */ switch ((unsigned char) c) { case '0': case '1': @@ -2527,12 +2862,14 @@ z7(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case '7': state = S47; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S48: /* e.g. "\\o{0" */ + case S47: /* e.g. "\\o{0" */ switch ((unsigned char) c) { case '}': state = S41; break; case '0': @@ -2543,43 +2880,29 @@ z7(struct lx_pcre_lx *lx) case '5': case '6': case '7': break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - default: - ; /* unreached */ - } - - switch (state) { - case S18: - case S19: - case S28: - case S37: - break; + case S48: /* e.g. 
"\\c\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CONTROL; default: - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } - break; - + ; /* unreached */ } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_INVALID; case S3: return TOK_NEWLINE; case S4: return TOK_WHITESPACE; case S5: return TOK_MAYBE_COMMENT; - case S6: return TOK_OPENCOUNT; - case S7: return TOK_OPENGROUP; + case S6: return lx->z = z1, TOK_OPENCOUNT; + case S7: return lx->z = z3, TOK_OPENGROUP; case S8: return TOK_OPEN; case S9: return TOK_ALT; case S10: return TOK_ANY; @@ -2589,33 +2912,59 @@ z7(struct lx_pcre_lx *lx) case S14: return TOK_END__NL; case S15: return TOK_START; case S16: return TOK_CLOSE; - case S17: return TOK_FLAGS; - case S18: return TOK_EOF; - case S19: return TOK_EOF; - case S20: return TOK_UNSUPPORTED; + case S17: return lx->z = z4, TOK_FLAGS; + case S18: return lx->z = z6, lx->z(lx); + case S19: return lx->z = z5, lx->z(lx); + case S21: return TOK_UNSUPPORTED; case S24: return TOK_OPENCAPTURE; - case S25: return TOK_OPENGROUPINV; - case S26: return TOK_OPENGROUPCB; - case S27: return TOK_OPENGROUPINVCB; - case S28: return TOK_EOF; - case S29: return TOK_NOESC; - case S30: return TOK_NOESC; - case S31: return TOK_NOESC; - case S32: return TOK_HEX; - case S33: return TOK_OCT; - case S34: return TOK_EOL; - case S35: return TOK_NAMED__CLASS; - case S36: return TOK_ESC; - case S37: return TOK_EOF; - case S38: return TOK_UNSUPPORTED; - case S39: return TOK_END; + case S25: return lx->z = z3, TOK_OPENGROUPINV; + case S26: return lx->z = z3, TOK_OPENGROUPCB; + case S27: return lx->z = z3, TOK_OPENGROUPINVCB; + case S28: return TOK_ESC; + case S29: return TOK_EOF; + case S30: return TOK_END; + case S31: return TOK_UNSUPPORTED; + case S32: return lx->z = z0, lx->z(lx); + case S33: return TOK_NOESC; + case S34: return 
TOK_NOESC; + case S35: return TOK_NOESC; + case S36: return TOK_HEX; + case S37: return TOK_NAMED__CLASS; + case S38: return TOK_EOL; + case S39: return TOK_OCT; case S40: return TOK_OCT; case S41: return TOK_OCT; case S43: return TOK_HEX; case S44: return TOK_HEX; - case S46: return TOK_CONTROL; - default: errno = EINVAL; return TOK_ERROR; + case S48: return TOK_CONTROL; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + switch (state) { + case S18: + case S19: + case S29: + case S32: + break; + + default: + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + break; + + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -3092,6 +3441,7 @@ lx_pcre_init(struct lx_pcre_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_pcre_dynpop; } enum lx_pcre_token diff --git a/src/libre/dialect/pcre/parser.c b/src/libre/dialect/pcre/parser.c index 78c11d795..e59fee020 100644 --- a/src/libre/dialect/pcre/parser.c +++ b/src/libre/dialect/pcre/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -325,7 +325,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta /* BEGINNING OF EXTRACT: FLAG_EXTENDED */ { -#line 666 "src/libre/parser.act" +#line 665 "src/libre/parser.act" ZIc = RE_EXTENDED; @@ -335,7 +335,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-union */ { -#line 801 "src/libre/parser.act" +#line 800 "src/libre/parser.act" (ZIo) = (ZIi) | (ZIc); @@ -356,7 +356,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta /* BEGINNING OF EXTRACT: FLAG_INSENSITIVE */ { -#line 662 "src/libre/parser.act" +#line 661 "src/libre/parser.act" ZIc = RE_ICASE; @@ 
-366,7 +366,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-union */ { -#line 801 "src/libre/parser.act" +#line 800 "src/libre/parser.act" (ZIo) = (ZIi) | (ZIc); @@ -381,7 +381,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta /* BEGINNING OF EXTRACT: FLAG_SINGLE */ { -#line 670 "src/libre/parser.act" +#line 669 "src/libre/parser.act" ZIc = RE_SINGLE; @@ -391,7 +391,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-union */ { -#line 801 "src/libre/parser.act" +#line 800 "src/libre/parser.act" (ZIo) = (ZIi) | (ZIc); @@ -406,7 +406,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ZIo = ZIi; /* BEGINNING OF ACTION: err-unknown-flag */ { -#line 743 "src/libre/parser.act" +#line 739 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EFLAG; @@ -451,7 +451,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -474,7 +474,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: CONTROL */ { -#line 448 "src/libre/parser.act" +#line 442 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] == 'c'); @@ -522,7 +522,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -538,7 +538,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: ESC */ { -#line 391 
"src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -574,7 +574,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -632,7 +632,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -685,7 +685,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: UNSUPPORTED */ { -#line 429 "src/libre/parser.act" +#line 426 "src/libre/parser.act" /* handle \1-\9 back references */ if (lex_state->buf.a[0] == '\\' && lex_state->buf.a[1] != '\0' && lex_state->buf.a[2] == '\0') { @@ -707,7 +707,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -726,7 +726,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags /* END OF INLINE: 155 */ /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -762,7 +762,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIclass), (ZInode))) { goto ZL1; @@ -775,7 +775,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; goto ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms; /* END OF INLINE: expr::character-class::list-of-class-terms 
*/ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -822,7 +822,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -851,7 +851,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; goto ZL2_expr_C_Clist_Hof_Hpieces; /* END OF INLINE: expr::list-of-pieces */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -874,7 +874,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla { /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (*ZI290)); if ((ZInode) == NULL) { @@ -894,7 +894,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_NAMED; (ZIlower).u.named.class = (*ZI290); @@ -910,7 +910,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla } /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI291)); mark(&act_state->rangeend, &(ZIend)); @@ -920,7 +920,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -980,7 +980,7 @@ p_168(flags flags, lex_state lex_state, act_state act_state, err err) case (TOK_RANGE): /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI169 = '-'; ZI170 = lex_state->lx.start; @@ -1004,7 +1004,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-range */ { -#line 722 
"src/libre/parser.act" +#line 718 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXRANGE; @@ -1043,7 +1043,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1069,7 +1069,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: CONTROL */ { -#line 448 "src/libre/parser.act" +#line 442 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] == 'c'); @@ -1124,7 +1124,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1163,7 +1163,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1224,7 +1224,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: NOESC */ { -#line 417 "src/libre/parser.act" +#line 412 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1251,7 +1251,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1307,7 +1307,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: UNSUPPORTED */ { -#line 429 "src/libre/parser.act" +#line 426 "src/libre/parser.act" /* handle \1-\9 back references */ if (lex_state->buf.a[0] == '\\' && lex_state->buf.a[1] 
!= '\0' && lex_state->buf.a[2] == '\0') { @@ -1329,7 +1329,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -1348,7 +1348,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* END OF INLINE: 111 */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1381,7 +1381,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1413,7 +1413,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CONTROL */ { -#line 448 "src/libre/parser.act" +#line 442 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] == 'c'); @@ -1461,7 +1461,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -1486,7 +1486,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1531,7 +1531,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned 
long u; char *s, *e; @@ -1598,7 +1598,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZI290 = DIALECT_CLASS(lex_state->buf.a); if (ZI290 == NULL) { @@ -1631,7 +1631,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: NOESC */ { -#line 417 "src/libre/parser.act" +#line 412 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1651,7 +1651,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1671,7 +1671,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1733,7 +1733,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: UNSUPPORTED */ { -#line 429 "src/libre/parser.act" +#line 426 "src/libre/parser.act" /* handle \1-\9 back references */ if (lex_state->buf.a[0] == '\\' && lex_state->buf.a[1] != '\0' && lex_state->buf.a[2] == '\0') { @@ -1755,7 +1755,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -1801,7 +1801,7 @@ p_expr_C_Ccomment(flags flags, lex_state lex_state, act_state act_state, err err ADVANCE_LEXER; /* BEGINNING OF ACTION: err-invalid-comment */ { -#line 687 "src/libre/parser.act" +#line 683 
"src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EBADCOMMENT; @@ -1835,7 +1835,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -1859,7 +1859,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_NAMED; (ZIr).u.named.class = (ZIid); @@ -1899,7 +1899,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUP */ { -#line 319 "src/libre/parser.act" +#line 318 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI181 = lex_state->lx.end; @@ -1913,7 +1913,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1940,7 +1940,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPCB */ { -#line 335 "src/libre/parser.act" +#line 334 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI200 = lex_state->lx.end; @@ -1954,7 +1954,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1967,7 +1967,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: make-literal-cbrak */ 
{ -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -1981,7 +1981,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -2003,7 +2003,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINV */ { -#line 327 "src/libre/parser.act" +#line 326 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI192 = lex_state->lx.end; @@ -2017,7 +2017,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2030,7 +2030,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -2087,7 +2087,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINVCB */ { -#line 343 "src/libre/parser.act" +#line 342 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI207 = lex_state->lx.end; @@ -2101,7 +2101,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2114,7 +2114,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 
"src/libre/parser.act" struct ast_expr *any; @@ -2157,7 +2157,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-make-invert */ /* BEGINNING OF ACTION: make-literal-cbrak */ { -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -2171,7 +2171,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -2203,7 +2203,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI214 = ']'; ZI215 = lex_state->lx.start; @@ -2219,7 +2219,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -2238,7 +2238,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUPRANGE */ { -#line 361 "src/libre/parser.act" +#line 360 "src/libre/parser.act" ZIcrange = '-'; ZI217 = lex_state->lx.start; @@ -2254,7 +2254,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIrange) = ast_make_expr_literal(act_state->poolp, *flags, (ZIcrange)); if ((ZIrange) == NULL) { @@ -2266,7 +2266,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if 
(!ast_add_expr_alt((ZItmp), (ZIrange))) { goto ZL4; @@ -2277,7 +2277,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -2295,7 +2295,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -2338,7 +2338,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI163 = lex_state->lx.start; @@ -2354,7 +2354,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -2401,7 +2401,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (*ZI314)); if ((ZInode) == NULL) { @@ -2421,7 +2421,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZI314); @@ -2437,7 +2437,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI } /* BEGINNING OF ACTION: mark-range */ { 
-#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI315)); mark(&act_state->rangeend, &(ZIend)); @@ -2447,7 +2447,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2522,7 +2522,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -2531,7 +2531,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: count-one */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -2578,7 +2578,7 @@ p_320(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI256 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -2592,7 +2592,7 @@ p_320(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI318)); mark(&act_state->countend, &(ZIend)); @@ -2602,7 +2602,7 @@ p_320(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((*ZIm) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -2656,7 +2656,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: 
ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2677,7 +2677,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -2689,7 +2689,7 @@ ZL1:; /* END OF ACTION: err-expected-alts */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2722,7 +2722,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI261 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -2736,7 +2736,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI318)); mark(&act_state->countend, &(ZIend)); @@ -2746,7 +2746,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-unbounded */ { -#line 805 "src/libre/parser.act" +#line 804 "src/libre/parser.act" (ZIn) = AST_COUNT_UNBOUNDED; @@ -2755,7 +2755,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: count-unbounded */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -2783,7 +2783,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; 
@@ -2811,7 +2811,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 case (TOK_CLOSECOUNT): /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI259 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -2829,7 +2829,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI318)); mark(&act_state->countend, &(ZIend)); @@ -2839,7 +2839,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -2885,7 +2885,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZIrstart = lex_state->lx.start; @@ -2906,7 +2906,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode1) == NULL) { @@ -2929,7 +2929,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (ZIc); @@ -2939,7 +2939,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 
"src/libre/parser.act" ZI197 = '-'; ZI198 = lex_state->lx.start; @@ -2960,7 +2960,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp } /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2999,7 +2999,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF INLINE: 196 */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZItmp), (ZInode1))) { goto ZL1; @@ -3044,7 +3044,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-none */ { -#line 797 "src/libre/parser.act" +#line 796 "src/libre/parser.act" (ZIempty__pos) = RE_FLAGS_NONE; @@ -3053,7 +3053,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: re-flag-none */ /* BEGINNING OF ACTION: re-flag-none */ { -#line 797 "src/libre/parser.act" +#line 796 "src/libre/parser.act" (ZIempty__neg) = RE_FLAGS_NONE; @@ -3110,7 +3110,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-mask-re-flags */ { -#line 931 "src/libre/parser.act" +#line 926 "src/libre/parser.act" /* * Note: in cases like `(?i-i)`, the negative is @@ -3124,7 +3124,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: ast-mask-re-flags */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -3144,7 +3144,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-get-re-flags */ { -#line 919 "src/libre/parser.act" +#line 918 "src/libre/parser.act" 
(ZIflags) = *flags; @@ -3153,7 +3153,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: ast-get-re-flags */ /* BEGINNING OF ACTION: ast-mask-re-flags */ { -#line 931 "src/libre/parser.act" +#line 926 "src/libre/parser.act" /* * Note: in cases like `(?i-i)`, the negative is @@ -3172,7 +3172,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, } /* BEGINNING OF ACTION: ast-set-re-flags */ { -#line 923 "src/libre/parser.act" +#line 922 "src/libre/parser.act" *flags = (ZIflags); @@ -3197,7 +3197,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, { /* BEGINNING OF ACTION: err-expected-closeflags */ { -#line 750 "src/libre/parser.act" +#line 746 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEFLAGS; @@ -3209,7 +3209,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: err-expected-closeflags */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -3250,7 +3250,7 @@ p_expr_C_Cpiece_C_Clist_Hof_Hcounts(flags flags, lex_state lex_state, act_state } /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -3275,7 +3275,7 @@ p_expr_C_Cpiece_C_Clist_Hof_Hcounts(flags flags, lex_state lex_state, act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -3292,7 +3292,7 @@ p_expr_C_Cpiece_C_Clist_Hof_Hcounts(flags flags, lex_state lex_state, act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 
"src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -3384,7 +3384,7 @@ p_class_Hnamed(flags flags, lex_state lex_state, act_state act_state, err err, t case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -3408,7 +3408,7 @@ p_class_Hnamed(flags flags, lex_state lex_state, act_state act_state, err err, t ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (ZIid)); if ((ZInode) == NULL) { @@ -3439,7 +3439,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (*ZIcbrak)); if ((ZInode1) == NULL) { @@ -3463,7 +3463,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (*ZIcbrak); @@ -3473,7 +3473,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI210 = '-'; ZI211 = lex_state->lx.start; @@ -3494,7 +3494,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs } /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZIcbrak); @@ -3504,7 +3504,7 @@ p_209(flags flags, lex_state 
lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -3566,7 +3566,7 @@ ZL2_expr_C_Clist_Hof_Halts:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIalts), (ZIa))) { goto ZL1; @@ -3585,7 +3585,7 @@ ZL2_expr_C_Clist_Hof_Halts:; goto ZL2_expr_C_Clist_Hof_Halts; /* END OF INLINE: expr::list-of-alts */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -3597,7 +3597,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -3629,7 +3629,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: OPENCOUNT */ { -#line 371 "src/libre/parser.act" +#line 370 "src/libre/parser.act" ZI318 = lex_state->lx.start; ZI319 = lex_state->lx.end; @@ -3645,7 +3645,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -3685,7 +3685,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-one */ { -#line 817 "src/libre/parser.act" +#line 816 "src/libre/parser.act" (ZIc) = ast_make_count(0, 1); @@ -3699,7 +3699,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-one-or-more */ { -#line 813 "src/libre/parser.act" +#line 812 "src/libre/parser.act" (ZIc) = ast_make_count(1, AST_COUNT_UNBOUNDED); @@ -3713,7 +3713,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, 
ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -3732,7 +3732,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-count */ { -#line 701 "src/libre/parser.act" +#line 697 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCOUNT; @@ -3744,7 +3744,7 @@ ZL1:; /* END OF ACTION: err-expected-count */ /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -3774,7 +3774,7 @@ p_re__pcre(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -3788,7 +3788,7 @@ p_re__pcre(flags flags, lex_state lex_state, act_state act_state, err err, t_ast } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -3814,7 +3814,7 @@ p_re__pcre(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -3850,7 +3850,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -3860,7 +3860,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -3877,7 +3877,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-end */ { -#line 943 "src/libre/parser.act" +#line 942 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_END); if ((ZIe) == NULL) { @@ -3894,7 +3894,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-end-nl */ { -#line 950 "src/libre/parser.act" +#line 949 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_END); if ((ZIe) == NULL) { @@ -3922,7 +3922,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-bsr */ { -#line 789 "src/libre/parser.act" +#line 787 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIclass__bsr) = &class_bsr; @@ -3932,7 +3932,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-bsr */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIbsr) = ast_make_expr_named(act_state->poolp, *flags, (ZIclass__bsr)); if ((ZIbsr) == NULL) { @@ -3944,7 +3944,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZIcrlf) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIcrlf) == NULL) { @@ -3956,7 
+3956,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-concat */ /* BEGINNING OF ACTION: make-literal-cr */ { -#line 890 "src/libre/parser.act" +#line 889 "src/libre/parser.act" (ZIcr) = '\r'; @@ -3965,7 +3965,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: make-literal-cr */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIecr) = ast_make_expr_literal(act_state->poolp, *flags, (ZIcr)); if ((ZIecr) == NULL) { @@ -3977,7 +3977,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: make-literal-nl */ { -#line 894 "src/libre/parser.act" +#line 893 "src/libre/parser.act" (ZInl) = '\n'; @@ -3986,7 +3986,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: make-literal-nl */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIenl) = ast_make_expr_literal(act_state->poolp, *flags, (ZInl)); if ((ZIenl) == NULL) { @@ -3998,7 +3998,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcrlf), (ZIecr))) { goto ZL1; @@ -4009,7 +4009,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-add-concat */ /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcrlf), (ZIenl))) { goto ZL1; @@ -4020,7 +4020,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-add-concat */ /* BEGINNING OF ACTION: 
ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZIe) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -4032,7 +4032,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-alt */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIe), (ZIcrlf))) { goto ZL1; @@ -4043,7 +4043,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIe), (ZIbsr))) { goto ZL1; @@ -4063,7 +4063,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-get-re-flags */ { -#line 919 "src/libre/parser.act" +#line 918 "src/libre/parser.act" (ZIflags) = *flags; @@ -4072,7 +4072,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-get-re-flags */ /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -4086,7 +4086,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e } /* BEGINNING OF ACTION: ast-set-re-flags */ { -#line 923 "src/libre/parser.act" +#line 922 "src/libre/parser.act" *flags = (ZIflags); @@ -4095,7 +4095,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-set-re-flags */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZIe) = ast_make_expr_group(act_state->poolp, *flags, (ZIg), (ZIid)); if ((ZIe) == NULL) { @@ -4119,7 +4119,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* 
BEGINNING OF ACTION: ast-make-anchor-start */ { -#line 936 "src/libre/parser.act" +#line 935 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_START); if ((ZIe) == NULL) { @@ -4178,7 +4178,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -4190,7 +4190,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -4223,7 +4223,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZInode) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -4244,7 +4244,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -4287,7 +4287,7 @@ p_expr_C_Ctype(flags flags, lex_state lex_state, act_state act_state, err err, t } /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -4299,7 +4299,7 @@ p_expr_C_Ctype(flags flags, lex_state lex_state, act_state act_state, err err, t /* END OF ACTION: ast-make-alt */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZInode), (ZIclass))) { goto ZL1; @@ -4319,7 +4319,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff 
--git a/src/libre/dialect/pcre/parser.h b/src/libre/dialect/pcre/parser.h index c0cfbabe3..84ef34223 100644 --- a/src/libre/dialect/pcre/parser.h +++ b/src/libre/dialect/pcre/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__pcre(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/pcre/parser.h" diff --git a/src/libre/dialect/sql/lexer.c b/src/libre/dialect/sql/lexer.c index 6c35bf800..87459e786 100644 --- a/src/libre/dialect/sql/lexer.c +++ b/src/libre/dialect/sql/lexer.c @@ -12,11 +12,31 @@ static enum lx_sql_token z0(struct lx_sql_lx *lx); static enum lx_sql_token z1(struct lx_sql_lx *lx); static enum lx_sql_token z2(struct lx_sql_lx *lx); +static int +lx_sql_advance_end(struct lx_sql_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_sql_lx *lx) +lx_sql_getc(struct lx_sql_lx *lx) { int c; @@ -32,18 +52,19 @@ lx_getc(struct lx_sql_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_sql_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_sql_getc to the interface + * in libfsm's generated code. 
*/ +static int +fsm_getc(void *getc_opaque) +{ + return lx_sql_getc((struct lx_sql_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -52,10 +73,7 @@ lx_sql_ungetc(struct lx_sql_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -107,6 +125,17 @@ lx_sql_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_sql_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_sql_dynclear(void *buf_opaque) { @@ -146,29 +175,28 @@ lx_sql_dynfree(void *buf_opaque) static enum lx_sql_token z0(struct lx_sql_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case ',': state = S1; break; case '0': @@ -182,12 +210,14 @@ z0(struct lx_sql_lx *lx) case '8': case '9': state = S2; break; case '}': state = S3; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S1: /* e.g. "," */ - lx_sql_ungetc(lx, c); return TOK_SEP; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_SEP; case S2: /* e.g. "0" */ switch ((unsigned char) c) { @@ -201,64 +231,70 @@ z0(struct lx_sql_lx *lx) case '7': case '8': case '9': break; - default: lx_sql_ungetc(lx, c); return TOK_COUNT; + default: lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_COUNT; } break; case S3: /* e.g. 
"}" */ - lx_sql_ungetc(lx, c); return lx->z = z2, TOK_CLOSECOUNT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSECOUNT; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_SEP; + case S2: return TOK_COUNT; + case S3: return lx->z = z2, TOK_CLOSECOUNT; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_SEP; - case S2: return TOK_COUNT; - case S3: return TOK_CLOSECOUNT; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_sql_token z1(struct lx_sql_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, - S30, S31, S32, S33, S34, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, + S30, S31, S32, S33, S34 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '[': state = S1; break; case '-': state = S3; break; @@ -271,21 +307,21 @@ z1(struct lx_sql_lx *lx) case S1: /* e.g. 
"[" */ switch ((unsigned char) c) { case ':': state = S6; break; - default: lx_sql_ungetc(lx, c); return TOK_CHAR; + default: lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CHAR; } break; case S2: /* e.g. "\\x00" */ - lx_sql_ungetc(lx, c); return TOK_CHAR; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "-" */ - lx_sql_ungetc(lx, c); return TOK_RANGE; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_RANGE; case S4: /* e.g. "^" */ - lx_sql_ungetc(lx, c); return TOK_INVERT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_INVERT; case S5: /* e.g. "]" */ - lx_sql_ungetc(lx, c); return lx->z = z2, TOK_CLOSEGROUP; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSEGROUP; case S6: /* e.g. "[:" */ switch ((unsigned char) c) { @@ -295,94 +331,120 @@ z1(struct lx_sql_lx *lx) case 'A': state = S10; break; case 'L': state = S11; break; case 'U': state = S12; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S7: /* e.g. "[:S" */ switch ((unsigned char) c) { case 'P': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S8: /* e.g. "[:W" */ switch ((unsigned char) c) { case 'H': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S9: /* e.g. "[:D" */ switch ((unsigned char) c) { case 'I': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S10: /* e.g. 
"[:A" */ switch ((unsigned char) c) { case 'L': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S11: /* e.g. "[:L" */ switch ((unsigned char) c) { case 'O': state = S19; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S12: /* e.g. "[:U" */ switch ((unsigned char) c) { case 'P': state = S13; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S13: /* e.g. "[:UP" */ switch ((unsigned char) c) { case 'P': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S14: /* e.g. "[:LOW" */ switch ((unsigned char) c) { case 'E': state = S15; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S15: /* e.g. "[:LOWE" */ switch ((unsigned char) c) { case 'R': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S16: /* e.g. "[:ALPHA" */ switch ((unsigned char) c) { case ':': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S17: /* e.g. 
"[:ALPHA:" */ switch ((unsigned char) c) { case ']': state = S18; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S18: /* e.g. "[:ALPHA:]" */ - lx_sql_ungetc(lx, c); return TOK_NAMED__CLASS; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; case S19: /* e.g. "[:LO" */ switch ((unsigned char) c) { case 'W': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -390,160 +452,196 @@ z1(struct lx_sql_lx *lx) switch ((unsigned char) c) { case 'N': state = S21; break; case 'P': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S21: /* e.g. "[:ALN" */ switch ((unsigned char) c) { case 'U': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S22: /* e.g. "[:ALP" */ switch ((unsigned char) c) { case 'H': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S23: /* e.g. "[:ALPH" */ switch ((unsigned char) c) { case 'A': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S24: /* e.g. 
"[:ALNU" */ switch ((unsigned char) c) { case 'M': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S25: /* e.g. "[:DI" */ switch ((unsigned char) c) { case 'G': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S26: /* e.g. "[:DIG" */ switch ((unsigned char) c) { case 'I': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S27: /* e.g. "[:DIGI" */ switch ((unsigned char) c) { case 'T': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S28: /* e.g. "[:WH" */ switch ((unsigned char) c) { case 'I': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S29: /* e.g. "[:WHI" */ switch ((unsigned char) c) { case 'T': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S30: /* e.g. "[:WHIT" */ switch ((unsigned char) c) { case 'E': state = S31; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S31: /* e.g. 
"[:WHITE" */ switch ((unsigned char) c) { case 'S': state = S7; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S32: /* e.g. "[:SP" */ switch ((unsigned char) c) { case 'A': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S33: /* e.g. "[:SPA" */ switch ((unsigned char) c) { case 'C': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S34: /* e.g. "[:SPAC" */ switch ((unsigned char) c) { case 'E': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; case S3: return TOK_RANGE; case S4: return TOK_INVERT; - case S5: return TOK_CLOSEGROUP; + case S5: return lx->z = z2, TOK_CLOSEGROUP; case S18: return TOK_NAMED__CLASS; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_sql_token z2(struct lx_sql_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, 
S9, - S10, S11, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '{': state = S2; break; case '[': state = S3; break; @@ -560,56 +658,48 @@ z2(struct lx_sql_lx *lx) break; case S1: /* e.g. "\\x00" */ - lx_sql_ungetc(lx, c); return TOK_CHAR; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "{" */ - lx_sql_ungetc(lx, c); return lx->z = z0, TOK_OPENCOUNT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z0, TOK_OPENCOUNT; case S3: /* e.g. "[" */ - lx_sql_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUP; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUP; case S4: /* e.g. "|" */ - lx_sql_ungetc(lx, c); return TOK_ALT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_ALT; case S5: /* e.g. "+" */ - lx_sql_ungetc(lx, c); return TOK_PLUS; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_PLUS; case S6: /* e.g. "*" */ - lx_sql_ungetc(lx, c); return TOK_STAR; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_STAR; - case S7: /* e.g. "?" */ - lx_sql_ungetc(lx, c); return TOK_OPT; + case S7: /* e.g. "\077" */ + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_OPT; case S8: /* e.g. ")" */ - lx_sql_ungetc(lx, c); return TOK_CLOSESUB; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CLOSESUB; case S9: /* e.g. "(" */ - lx_sql_ungetc(lx, c); return TOK_OPENSUB; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_OPENSUB; case S10: /* e.g. 
"%" */ - lx_sql_ungetc(lx, c); return TOK_MANY; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_MANY; case S11: /* e.g. "_" */ - lx_sql_ungetc(lx, c); return TOK_ANY; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_ANY; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; - case S2: return TOK_OPENCOUNT; - case S3: return TOK_OPENGROUP; + case S2: return lx->z = z0, TOK_OPENCOUNT; + case S3: return lx->z = z1, TOK_OPENGROUP; case S4: return TOK_ALT; case S5: return TOK_PLUS; case S6: return TOK_STAR; @@ -618,8 +708,23 @@ z2(struct lx_sql_lx *lx) case S9: return TOK_OPENSUB; case S10: return TOK_MANY; case S11: return TOK_ANY; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -747,6 +852,7 @@ lx_sql_init(struct lx_sql_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_sql_dynpop; } enum lx_sql_token diff --git a/src/libre/dialect/sql/parser.c b/src/libre/dialect/sql/parser.c index d380c5b7d..e7a4c2e75 100644 --- a/src/libre/dialect/sql/parser.c +++ b/src/libre/dialect/sql/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -311,7 +311,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: INVERT */ { -#line 303 "src/libre/parser.act" +#line 302 "src/libre/parser.act" ZI203 = '^'; @@ -337,7 +337,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, 
lex_state lex_state, act_ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI114 = lex_state->lx.start; @@ -353,7 +353,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -365,7 +365,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, lex_state lex_state, act_ /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZIclass), (ZInode))) { goto ZL1; @@ -401,7 +401,7 @@ p_re__sql(flags flags, lex_state lex_state, act_state act_state, err err, t_ast_ /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -415,7 +415,7 @@ p_re__sql(flags flags, lex_state lex_state, act_state act_state, err err, t_ast_ } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -441,7 +441,7 @@ p_re__sql(flags flags, lex_state lex_state, act_state act_state, err err, t_ast_ { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -484,7 +484,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIclass), (ZInode))) { goto ZL4; @@ -499,7 +499,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; { /* BEGINNING OF ACTION: err-expected-term */ { -#line 
694 "src/libre/parser.act" +#line 690 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXTERM; @@ -522,7 +522,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; goto ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms; /* END OF INLINE: expr::character-class::list-of-class-terms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -560,7 +560,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -579,7 +579,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; goto ZL2_expr_C_Clist_Hof_Hpieces; /* END OF INLINE: expr::list-of-pieces */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -606,7 +606,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -670,7 +670,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state case (TOK_OPENGROUP): /* BEGINNING OF EXTRACT: OPENGROUP */ { -#line 319 "src/libre/parser.act" +#line 318 "src/libre/parser.act" ZIopen__start = lex_state->lx.start; ZIopen__end = lex_state->lx.end; @@ -688,7 +688,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZIclass) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZIclass) == NULL) { @@ -713,7 +713,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI154 = ']'; ZIclose__start = lex_state->lx.start; @@ -729,7 +729,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, 
act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIopen__start)); mark(&act_state->groupend, &(ZIclose__end)); @@ -748,7 +748,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: INVERT */ { -#line 303 "src/libre/parser.act" +#line 302 "src/libre/parser.act" ZI158 = '^'; @@ -760,7 +760,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZImask) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZImask) == NULL) { @@ -789,7 +789,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state case (TOK_CLOSEGROUP): /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI163 = ']'; ZIclose__start = lex_state->lx.start; @@ -809,7 +809,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIopen__start)); mark(&act_state->groupend, &(ZIclose__end)); @@ -823,7 +823,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -839,7 +839,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF INLINE: 162 */ /* BEGINNING OF ACTION: ast-make-subtract */ { -#line 960 "src/libre/parser.act" +#line 959 "src/libre/parser.act" (ZInode) = ast_make_expr_subtract(act_state->poolp, *flags, (ZIclass), (ZImask)); if ((ZInode) == NULL) { @@ -862,7 
+862,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -874,7 +874,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: err-expected-closegroup */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -917,7 +917,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, } /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -954,7 +954,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -975,7 +975,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -987,7 +987,7 @@ ZL1:; /* END OF ACTION: err-expected-alts */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1019,7 +1019,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI117 = lex_state->lx.start; @@ -1035,7 +1035,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err 
err, t_ast__exp ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1047,7 +1047,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZIclass), (ZInode))) { goto ZL1; @@ -1058,7 +1058,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1105,7 +1105,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp { /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1167,7 +1167,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (*ZI205)); if ((ZInode) == NULL) { @@ -1192,7 +1192,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIa).type = AST_ENDPOINT_LITERAL; (ZIa).u.literal.c = (unsigned char) (*ZI205); @@ -1202,7 +1202,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI136 = '-'; ZI137 = lex_state->lx.start; @@ -1220,7 +1220,7 @@ 
p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI case (TOK_CHAR): /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1243,7 +1243,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIz).type = AST_ENDPOINT_LITERAL; (ZIz).u.literal.c = (unsigned char) (ZIcz); @@ -1253,7 +1253,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI206)); mark(&act_state->rangeend, &(ZIend)); @@ -1263,7 +1263,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -1321,7 +1321,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI176 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -1335,7 +1335,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI209)); mark(&act_state->countend, &(ZIend)); @@ -1345,7 +1345,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 
824 "src/libre/parser.act" if ((*ZIm) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -1376,7 +1376,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -1408,7 +1408,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_CLOSECOUNT): /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI179 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -1426,7 +1426,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI209)); mark(&act_state->countend, &(ZIend)); @@ -1436,7 +1436,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -1486,7 +1486,7 @@ ZL2_expr_C_Clist_Hof_Halts:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIalts), (ZIa))) { goto ZL1; @@ -1505,7 +1505,7 @@ ZL2_expr_C_Clist_Hof_Halts:; goto ZL2_expr_C_Clist_Hof_Halts; /* END OF INLINE: expr::list-of-alts */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -1517,7 +1517,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -1549,7 +1549,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: OPENCOUNT */ { -#line 371 "src/libre/parser.act" +#line 370 
"src/libre/parser.act" ZI209 = lex_state->lx.start; ZI210 = lex_state->lx.end; @@ -1565,7 +1565,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -1605,7 +1605,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-one */ { -#line 817 "src/libre/parser.act" +#line 816 "src/libre/parser.act" (ZIc) = ast_make_count(0, 1); @@ -1619,7 +1619,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-one-or-more */ { -#line 813 "src/libre/parser.act" +#line 812 "src/libre/parser.act" (ZIc) = ast_make_count(1, AST_COUNT_UNBOUNDED); @@ -1633,7 +1633,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -1646,7 +1646,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, { /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -1663,7 +1663,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-count */ { -#line 701 "src/libre/parser.act" +#line 697 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCOUNT; @@ -1675,7 +1675,7 @@ ZL1:; /* END OF ACTION: err-expected-count */ /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -1704,7 +1704,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 
"src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? &class_any : &class_notnl; @@ -1714,7 +1714,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -1734,7 +1734,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1753,7 +1753,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIe) = ast_make_expr_literal(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -1774,7 +1774,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -1784,7 +1784,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIg) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIg) == NULL) { @@ -1796,7 +1796,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -1805,7 +1805,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: count-zero-or-more */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); @@ -1832,7 +1832,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -1846,7 +1846,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZIe) = ast_make_expr_group(act_state->poolp, *flags, (ZIg), (ZIid)); if ((ZIe) == NULL) { @@ -1884,7 +1884,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -1896,7 +1896,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, 
*flags); if ((ZIe) == NULL) { @@ -1932,7 +1932,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hnamed(flags flags, lex_state lex_state, act case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -1956,7 +1956,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hnamed(flags flags, lex_state lex_state, act ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (ZIid)); if ((ZInode) == NULL) { @@ -1986,7 +1986,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZInode) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2007,7 +2007,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2032,7 +2032,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/sql/parser.h b/src/libre/dialect/sql/parser.h index c5e885439..7825ae3af 100644 --- a/src/libre/dialect/sql/parser.h +++ b/src/libre/dialect/sql/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__sql(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/sql/parser.h" diff --git a/src/lx/lexer.c b/src/lx/lexer.c index 03bfbf463..ed7ca6d78 100644 
--- a/src/lx/lexer.c +++ b/src/lx/lexer.c @@ -14,6 +14,26 @@ static enum lx_token z2(struct lx *lx); static enum lx_token z3(struct lx *lx); static enum lx_token z4(struct lx *lx); +static int +lx_advance_end(struct lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif @@ -34,18 +54,19 @@ lx_getc(struct lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_getc((struct lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -54,10 +75,7 @@ lx_ungetc(struct lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -67,13 +85,20 @@ lx_ungetc(struct lx *lx, int c) } } +/* Get a character from fgetc and push it to the buffer */ int lx_fgetc(struct lx *lx) { assert(lx != NULL); assert(lx->getc_opaque != NULL); - return fgetc(lx->getc_opaque); + const int c = fgetc(lx->getc_opaque); + if (c == EOF) { + lx->c = EOF; + return EOF; + } else { + return c; + } } int @@ -118,6 +143,17 @@ lx_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_dynclear(void *buf_opaque) { @@ -157,37 +193,36 @@ lx_dynfree(void *buf_opaque) static enum lx_token z0(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, 
S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '/': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "\057" */ switch ((unsigned char) c) { @@ -243,79 +278,73 @@ z0(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return lx->z = z4, TOK_RE; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, TOK_RE; } break; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return lx->z = z4, TOK_RE; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_RE; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z1(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, 
S4, S5, S6, S7 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { - case '"': state = S2; break; - case '\\': state = S3; break; - default: state = S1; break; + case '\\': state = S1; break; + case '"': state = S3; break; + default: state = S2; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; - - case S2: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z4, TOK_STR; - - case S3: /* e.g. "\\" */ + case S1: /* e.g. "\\" */ switch ((unsigned char) c) { - case '"': - case '\\': - case 'f': - case 'n': - case 'r': - case 't': - case 'v': state = S4; break; + case 'x': state = S4; break; case '0': case '1': case '2': @@ -324,29 +353,24 @@ z1(struct lx *lx) case '5': case '6': case '7': state = S5; break; - case 'x': state = S6; break; - default: lx_ungetc(lx, c); return TOK_CHAR; + case '"': + case '\\': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': state = S6; break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S4: /* e.g. "\\f" */ - lx_ungetc(lx, c); return TOK_ESC; + case S2: /* e.g. "\\x00" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; - case S5: /* e.g. "\\0" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': break; - default: lx_ungetc(lx, c); return TOK_OCT; - } - break; + case S3: /* e.g. "\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, TOK_STR; - case S6: /* e.g. "\\x" */ + case S4: /* e.g. 
"\\x" */ switch ((unsigned char) c) { case '0': case '1': @@ -370,11 +394,30 @@ z1(struct lx *lx) case 'd': case 'e': case 'f': state = S7; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S5: /* e.g. "\\0" */ + switch ((unsigned char) c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S7: /* e.g. "\\xa" */ + case S6: /* e.g. "\\\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ESC; + + case S7: /* e.g. "\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -398,135 +441,157 @@ z1(struct lx *lx) case 'd': case 'e': case 'f': break; - default: lx_ungetc(lx, c); return TOK_HEX; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_HEX; } break; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_CHAR; + case S3: return lx->z = z4, TOK_STR; + case S5: return TOK_OCT; + case S6: return TOK_ESC; + case S7: return TOK_HEX; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_STR; - case S3: return TOK_CHAR; - case S4: return TOK_ESC; - case S5: return TOK_OCT; - case S7: return TOK_HEX; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z2(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != 
NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\'': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "'" */ - lx_ungetc(lx, c); return lx->z = z4, TOK_STR; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, TOK_STR; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return lx->z = z4, TOK_STR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_STR; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z3(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '\n': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return lx->z(lx); + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); case S2: /* e.g. "" */ - lx_ungetc(lx, c); return lx->z = z4, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return lx->z = z4, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S0: @@ -536,75 +601,52 @@ z3(struct lx *lx) default: if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z4(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { - case '\t': - case '\n': - case '\r': - case ' ': state = S1; break; - case '!': state = S2; break; - case '"': state = S3; break; - case '#': state = S4; break; - case '$': state = S5; break; - case '&': state = S6; break; - case '\'': state = S7; break; - case '(': state = S8; break; - case ')': state = S9; break; - case '*': state = S10; break; - case '+': state = S11; break; - case ',': state = S12; break; - case '-': state = S13; break; - case '.': state = S14; break; - case '/': state = S15; break; - case ';': state = S16; break; - case '=': state = S17; break; - case '?': state = S18; break; + case ',': state = S1; break; + case '$': state = S2; break; case 'A': case 'B': case 'C': @@ -657,37 +699,42 @@ z4(struct lx *lx) case 'w': case 'x': case 'y': - case 'z': state = S19; break; - case '\\': state = S20; break; - case '^': state = S21; break; - case '{': state = S22; break; - case '|': state = S23; break; - case '}': state = S24; break; - case '~': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; - } - break; - - case S1: /* e.g. 
"\\x09" */ - switch ((unsigned char) c) { + case 'z': state = S3; break; + case '&': state = S4; break; + case '|': state = S5; break; + case '.': state = S6; break; + case '-': state = S7; break; + case '\\': state = S8; break; + case '^': state = S9; break; + case '!': state = S10; break; + case '~': state = S11; break; + case '?': state = S12; break; + case '+': state = S13; break; + case '*': state = S14; break; + case ')': state = S15; break; + case '(': state = S16; break; + case '}': state = S17; break; + case '{': state = S18; break; + case ';': state = S19; break; + case '=': state = S20; break; + case '/': state = S21; break; + case '"': state = S22; break; + case '\'': state = S23; break; + case '#': state = S24; break; case '\t': case '\n': case '\r': - case ' ': break; - default: lx_ungetc(lx, c); return lx->z(lx); + case ' ': state = S25; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S2: /* e.g. "!" */ - lx_ungetc(lx, c); return TOK_BANG; - - case S3: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z1, lx->z(lx); - - case S4: /* e.g. "#" */ - lx_ungetc(lx, c); return lx->z = z3, lx->z(lx); + case S1: /* e.g. "," */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_COMMA; - case S5: /* e.g. "$" */ + case S2: /* e.g. "$" */ switch ((unsigned char) c) { case 'A': case 'B': @@ -742,58 +789,13 @@ z4(struct lx *lx) case 'x': case 'y': case 'z': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S6: /* e.g. "&" */ - lx_ungetc(lx, c); return TOK_AND; - - case S7: /* e.g. "'" */ - lx_ungetc(lx, c); return lx->z = z2, lx->z(lx); - - case S8: /* e.g. "(" */ - lx_ungetc(lx, c); return TOK_LPAREN; - - case S9: /* e.g. ")" */ - lx_ungetc(lx, c); return TOK_RPAREN; - - case S10: /* e.g. 
"*" */ - lx_ungetc(lx, c); return TOK_STAR; - - case S11: /* e.g. "+" */ - lx_ungetc(lx, c); return TOK_CROSS; - - case S12: /* e.g. "," */ - lx_ungetc(lx, c); return TOK_COMMA; - - case S13: /* e.g. "-" */ - switch ((unsigned char) c) { - case '>': state = S27; break; - default: lx_ungetc(lx, c); return TOK_DASH; - } - break; - - case S14: /* e.g. "." */ - switch ((unsigned char) c) { - case '.': state = S26; break; - default: lx_ungetc(lx, c); return TOK_DOT; - } - break; - - case S15: /* e.g. "\057" */ - lx_ungetc(lx, c); return lx->z = z0, lx->z(lx); - - case S16: /* e.g. ";" */ - lx_ungetc(lx, c); return TOK_SEMI; - - case S17: /* e.g. "=" */ - lx_ungetc(lx, c); return TOK_BIND; - - case S18: /* e.g. "?" */ - lx_ungetc(lx, c); return TOK_QMARK; - - case S19: /* e.g. "a" */ + case S3: /* e.g. "A" */ switch ((unsigned char) c) { case '0': case '1': @@ -858,35 +860,98 @@ z4(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; - case S20: /* e.g. "\\" */ - lx_ungetc(lx, c); return TOK_DASH; + case S4: /* e.g. "&" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_AND; + + case S5: /* e.g. "|" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_PIPE; + + case S6: /* e.g. "." */ + switch ((unsigned char) c) { + case '.': state = S27; break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_DOT; + } + break; + + case S7: /* e.g. "-" */ + switch ((unsigned char) c) { + case '>': state = S26; break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_DASH; + } + break; + + case S8: /* e.g. "\\" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_DASH; + + case S9: /* e.g. "^" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_HAT; - case S21: /* e.g. "^" */ - lx_ungetc(lx, c); return TOK_HAT; + case S10: /* e.g. "!" 
*/ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_BANG; - case S22: /* e.g. "{" */ - lx_ungetc(lx, c); return TOK_OPEN; + case S11: /* e.g. "~" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TILDE; - case S23: /* e.g. "|" */ - lx_ungetc(lx, c); return TOK_PIPE; + case S12: /* e.g. "\077" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_QMARK; - case S24: /* e.g. "}" */ - lx_ungetc(lx, c); return TOK_CLOSE; + case S13: /* e.g. "+" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CROSS; - case S25: /* e.g. "~" */ - lx_ungetc(lx, c); return TOK_TILDE; + case S14: /* e.g. "*" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_STAR; - case S26: /* e.g. ".." */ - lx_ungetc(lx, c); return TOK_TO; + case S15: /* e.g. ")" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_RPAREN; - case S27: /* e.g. "->" */ - lx_ungetc(lx, c); return TOK_MAP; + case S16: /* e.g. "(" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_LPAREN; - case S28: /* e.g. "$a" */ + case S17: /* e.g. "}" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CLOSE; + + case S18: /* e.g. "{" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_OPEN; + + case S19: /* e.g. ";" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_SEMI; + + case S20: /* e.g. "=" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_BIND; + + case S21: /* e.g. "\057" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z0, lx->z(lx); + + case S22: /* e.g. "\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z1, lx->z(lx); + + case S23: /* e.g. "'" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z2, lx->z(lx); + + case S24: /* e.g. "#" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z3, lx->z(lx); + + case S25: /* e.g. 
"\\x09" */ + switch ((unsigned char) c) { + case '\t': + case '\n': + case '\r': + case ' ': break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); + } + break; + + case S26: /* e.g. "->" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_MAP; + + case S27: /* e.g. ".." */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TO; + + case S28: /* e.g. "$A" */ switch ((unsigned char) c) { case '0': case '1': @@ -951,66 +1016,73 @@ z4(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return TOK_TOKEN; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TOKEN; } break; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_COMMA; + case S3: return TOK_IDENT; + case S4: return TOK_AND; + case S5: return TOK_PIPE; + case S6: return TOK_DOT; + case S7: return TOK_DASH; + case S8: return TOK_DASH; + case S9: return TOK_HAT; + case S10: return TOK_BANG; + case S11: return TOK_TILDE; + case S12: return TOK_QMARK; + case S13: return TOK_CROSS; + case S14: return TOK_STAR; + case S15: return TOK_RPAREN; + case S16: return TOK_LPAREN; + case S17: return TOK_CLOSE; + case S18: return TOK_OPEN; + case S19: return TOK_SEMI; + case S20: return TOK_BIND; + case S21: return lx->z = z0, lx->z(lx); + case S22: return lx->z = z1, lx->z(lx); + case S23: return lx->z = z2, lx->z(lx); + case S24: return lx->z = z3, lx->z(lx); + case S25: return TOK_EOF; + case S26: return TOK_MAP; + case S27: return TOK_TO; + case S28: return TOK_TOKEN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { - case S1: - case S3: - case S4: - case S7: - case S15: + case S21: + case S22: + case S23: + case S24: + case S25: break; default: if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } break; } - } lx->lgetc = 
NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_BANG; - case S3: return TOK_EOF; - case S4: return TOK_EOF; - case S6: return TOK_AND; - case S7: return TOK_EOF; - case S8: return TOK_LPAREN; - case S9: return TOK_RPAREN; - case S10: return TOK_STAR; - case S11: return TOK_CROSS; - case S12: return TOK_COMMA; - case S13: return TOK_DASH; - case S14: return TOK_DOT; - case S15: return TOK_EOF; - case S16: return TOK_SEMI; - case S17: return TOK_BIND; - case S18: return TOK_QMARK; - case S19: return TOK_IDENT; - case S20: return TOK_DASH; - case S21: return TOK_HAT; - case S22: return TOK_OPEN; - case S23: return TOK_PIPE; - case S24: return TOK_CLOSE; - case S25: return TOK_TILDE; - case S26: return TOK_TO; - case S27: return TOK_MAP; - case S28: return TOK_TOKEN; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -1238,6 +1310,7 @@ lx_init(struct lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_dynpop; } enum lx_token diff --git a/src/lx/parser.c b/src/lx/parser.c index f6f759693..4ab69ce21 100644 --- a/src/lx/parser.c +++ b/src/lx/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 127 "src/lx/parser.act" +#line 27 "src/lx/parser.act" #include @@ -182,7 +182,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-list */ { -#line 816 "src/lx/parser.act" +#line 814 "src/lx/parser.act" err_expected(lex_state, "list of mappings, bindings or zones"); @@ -204,7 +204,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) /* BEGINNING OF EXTRACT: IDENT */ { -#line 228 "src/lx/parser.act" +#line 227 "src/lx/parser.act" ZIn = xstrdup(lex_state->buf.a); @@ -214,7 +214,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: deref-var */ { -#line 280 "src/lx/parser.act" +#line 277 "src/lx/parser.act" struct ast_zone *z; @@ -252,7 
+252,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) /* BEGINNING OF EXTRACT: TOKEN */ { -#line 224 "src/lx/parser.act" +#line 222 "src/lx/parser.act" /* TODO: submatch addressing */ ZIt = xstrdup(lex_state->buf.a + 1); /* +1 for '$' prefix */ @@ -263,7 +263,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: deref-token */ { -#line 308 "src/lx/parser.act" +#line 304 "src/lx/parser.act" const struct ast_mapping *m; fsm_state_t start; @@ -374,7 +374,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: op-reverse */ { -#line 677 "src/lx/parser.act" +#line 676 "src/lx/parser.act" assert((ZI210) != NULL); @@ -398,7 +398,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI231) != NULL); @@ -449,7 +449,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: op-complete */ { -#line 668 "src/lx/parser.act" +#line 667 "src/lx/parser.act" assert((ZI210) != NULL); @@ -473,7 +473,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI239) != NULL); @@ -509,7 +509,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z /* BEGINNING OF EXTRACT: IDENT */ { -#line 228 "src/lx/parser.act" +#line 227 "src/lx/parser.act" ZIn = xstrdup(lex_state->buf.a); @@ -563,7 +563,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI248) != NULL); @@ -614,7 +614,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, 
act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: op-complement */ { -#line 659 "src/lx/parser.act" +#line 658 "src/lx/parser.act" assert((ZI210) != NULL); @@ -638,7 +638,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI223) != NULL); @@ -685,7 +685,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z /* BEGINNING OF EXTRACT: TOKEN */ { -#line 224 "src/lx/parser.act" +#line 222 "src/lx/parser.act" /* TODO: submatch addressing */ ZI253 = xstrdup(lex_state->buf.a + 1); /* +1 for '$' prefix */ @@ -696,7 +696,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z ADVANCE_LEXER; /* BEGINNING OF ACTION: deref-token */ { -#line 308 "src/lx/parser.act" +#line 304 "src/lx/parser.act" const struct ast_mapping *m; fsm_state_t start; @@ -769,7 +769,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI259) != NULL); @@ -829,7 +829,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI268) != NULL); @@ -869,7 +869,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-thing */ { -#line 812 "src/lx/parser.act" +#line 810 "src/lx/parser.act" err_expected(lex_state, "mapping, binding or zone"); @@ -899,7 +899,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-open */ { -#line 800 "src/lx/parser.act" +#line 798 "src/lx/parser.act" err_expected(lex_state, "'{'"); @@ -929,7 +929,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-close */ { -#line 804 "src/lx/parser.act" +#line 802 "src/lx/parser.act" err_expected(lex_state, "'}'"); @@ -955,7 +955,7 @@ ZL2_pattern_C_Cbody:; { 
/* BEGINNING OF EXTRACT: CHAR */ { -#line 219 "src/lx/parser.act" +#line 215 "src/lx/parser.act" assert(lex_state->buf.a[0] != '\0'); assert(lex_state->buf.a[1] == '\0'); @@ -972,7 +972,7 @@ ZL2_pattern_C_Cbody:; { /* BEGINNING OF EXTRACT: ESC */ { -#line 149 "src/lx/parser.act" +#line 143 "src/lx/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1000,7 +1000,7 @@ ZL2_pattern_C_Cbody:; { /* BEGINNING OF EXTRACT: HEX */ { -#line 212 "src/lx/parser.act" +#line 188 "src/lx/parser.act" unsigned long u; char *e; @@ -1037,7 +1037,7 @@ ZL2_pattern_C_Cbody:; { /* BEGINNING OF EXTRACT: OCT */ { -#line 185 "src/lx/parser.act" +#line 161 "src/lx/parser.act" unsigned long u; char *e; @@ -1077,7 +1077,7 @@ ZL2_pattern_C_Cbody:; /* END OF INLINE: 84 */ /* BEGINNING OF ACTION: pattern-char */ { -#line 249 "src/lx/parser.act" +#line 247 "src/lx/parser.act" /* TODO */ *lex_state->p++ = (ZIc); @@ -1089,7 +1089,7 @@ ZL2_pattern_C_Cbody:; goto ZL2_pattern_C_Cbody; /* END OF INLINE: pattern::body */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -1124,7 +1124,7 @@ ZL2_174:; } /* END OF INLINE: 174 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI171 = ZI168; @@ -1211,7 +1211,7 @@ ZL2_180:; } /* BEGINNING OF ACTION: op-alt */ { -#line 725 "src/lx/parser.act" +#line 724 "src/lx/parser.act" assert((ZI177) != NULL); assert((ZIb) != NULL); @@ -1230,7 +1230,7 @@ ZL2_180:; goto ZL2_180; /* END OF INLINE: 180 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI178 = ZI176; @@ -1291,7 +1291,7 @@ p_expr_C_Cprefix_Hexpr(lex_state lex_state, act_state act_state, zone ZIz, fsm * } /* BEGINNING OF ACTION: op-reverse */ { -#line 677 "src/lx/parser.act" +#line 676 "src/lx/parser.act" assert((ZIq) != NULL); @@ -1315,7 +1315,7 @@ p_expr_C_Cprefix_Hexpr(lex_state lex_state, act_state act_state, zone ZIz, fsm * } /* BEGINNING OF ACTION: op-complete */ { -#line 668 "src/lx/parser.act" +#line 667 "src/lx/parser.act" assert((ZIq) != 
NULL); @@ -1339,7 +1339,7 @@ p_expr_C_Cprefix_Hexpr(lex_state lex_state, act_state act_state, zone ZIz, fsm * } /* BEGINNING OF ACTION: op-complement */ { -#line 659 "src/lx/parser.act" +#line 658 "src/lx/parser.act" assert((ZIq) != NULL); @@ -1398,7 +1398,7 @@ ZL2_186:; } /* BEGINNING OF ACTION: op-intersect */ { -#line 714 "src/lx/parser.act" +#line 713 "src/lx/parser.act" assert((ZI183) != NULL); assert((ZIb) != NULL); @@ -1417,7 +1417,7 @@ ZL2_186:; goto ZL2_186; /* END OF INLINE: 186 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI184 = ZI182; @@ -1455,7 +1455,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hto_Hmappings_C_Clist_Hof_ } /* BEGINNING OF ACTION: op-alt */ { -#line 725 "src/lx/parser.act" +#line 724 "src/lx/parser.act" assert((ZIold_Hexit) != NULL); assert((ZInew_Hexit) != NULL); @@ -1473,7 +1473,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hto_Hmappings_C_Clist_Hof_ goto ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hto_Hmappings_C_Clist_Hof_Hzone_Hto_Hmappings_Hx; /* END OF INLINE: list-of-things::zone-thing::list-of-zone-to-mappings::list-of-zone-to-mappings-x */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -1554,7 +1554,7 @@ ZL2_197:; ADVANCE_LEXER; /* BEGINNING OF ACTION: op-cross */ { -#line 575 "src/lx/parser.act" +#line 568 "src/lx/parser.act" fsm_state_t start, end; fsm_state_t old; @@ -1607,13 +1607,13 @@ ZL2_197:; goto ZL2_197; /* END OF INLINE: 197 */ } - /*UNREACHED*/ + /* UNREACHED */ case (TOK_QMARK): { ADVANCE_LEXER; /* BEGINNING OF ACTION: op-qmark */ { -#line 620 "src/lx/parser.act" +#line 613 "src/lx/parser.act" fsm_state_t start, end; fsm_state_t old; @@ -1666,13 +1666,13 @@ ZL2_197:; goto ZL2_197; /* END OF INLINE: 197 */ } - /*UNREACHED*/ + /* UNREACHED */ case (TOK_STAR): { ADVANCE_LEXER; /* BEGINNING OF ACTION: op-star */ { -#line 525 "src/lx/parser.act" +#line 518 "src/lx/parser.act" fsm_state_t start, end; fsm_state_t old; @@ -1730,14 +1730,14 @@ 
ZL2_197:; goto ZL2_197; /* END OF INLINE: 197 */ } - /*UNREACHED*/ + /* UNREACHED */ default: goto ZL1; } } /* END OF INLINE: 272 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI196 = ZI191; @@ -1795,7 +1795,7 @@ p_204(lex_state lex_state, act_state act_state, zone *ZIz, fsm *ZI202, fsm *ZOq) } /* BEGINNING OF ACTION: op-subtract */ { -#line 703 "src/lx/parser.act" +#line 702 "src/lx/parser.act" assert((*ZI202) != NULL); assert((ZIb) != NULL); @@ -1870,7 +1870,7 @@ p_208(lex_state lex_state, act_state act_state, zone *ZIz, fsm *ZI206, fsm *ZOq) } /* BEGINNING OF ACTION: op-concat */ { -#line 686 "src/lx/parser.act" +#line 685 "src/lx/parser.act" assert((*ZI206) != NULL); assert((ZIb) != NULL); @@ -1920,7 +1920,7 @@ p_212(lex_state lex_state, act_state act_state, zone *ZIz, fsm *ZI210, fsm *ZOq) } /* BEGINNING OF ACTION: op-product */ { -#line 698 "src/lx/parser.act" +#line 696 "src/lx/parser.act" fprintf(stderr, "unimplemented\n"); (ZIq) = NULL; @@ -1960,7 +1960,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* BEGINNING OF EXTRACT: RE */ { -#line 236 "src/lx/parser.act" +#line 231 "src/lx/parser.act" assert(lex_state->buf.a[0] == '/'); @@ -1980,7 +1980,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: pattern-buffer */ { -#line 263 "src/lx/parser.act" +#line 252 "src/lx/parser.act" size_t len; @@ -2010,7 +2010,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* END OF ACTION: pattern-buffer */ /* BEGINNING OF ACTION: compile-regex */ { -#line 379 "src/lx/parser.act" +#line 376 "src/lx/parser.act" struct re_err err; @@ -2030,7 +2030,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* END OF ACTION: compile-regex */ /* BEGINNING OF ACTION: free-arr */ { -#line 766 "src/lx/parser.act" +#line 765 "src/lx/parser.act" free((ZIa)); @@ -2046,7 +2046,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: pattern-buffer */ { 
-#line 263 "src/lx/parser.act" +#line 252 "src/lx/parser.act" size_t len; @@ -2076,7 +2076,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* END OF ACTION: pattern-buffer */ /* BEGINNING OF ACTION: compile-literal */ { -#line 364 "src/lx/parser.act" +#line 361 "src/lx/parser.act" struct re_err err; @@ -2096,7 +2096,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* END OF ACTION: compile-literal */ /* BEGINNING OF ACTION: free-arr */ { -#line 766 "src/lx/parser.act" +#line 765 "src/lx/parser.act" free((ZIa)); @@ -2164,7 +2164,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Ho } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZIr) != NULL); @@ -2189,7 +2189,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Ho /* END OF ACTION: subtract-exit */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2229,7 +2229,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Ho goto ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Hof_Hzone_Hfrom_Hmappings_Hx; /* END OF INLINE: list-of-things::zone-thing::list-of-zone-from-mappings::list-of-zone-from-mappings-x */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -2256,7 +2256,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* BEGINNING OF ACTION: no-zone */ { -#line 515 "src/lx/parser.act" +#line 514 "src/lx/parser.act" (ZIparent) = NULL; @@ -2265,7 +2265,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* END OF ACTION: no-zone */ /* BEGINNING OF ACTION: make-ast */ { -#line 424 "src/lx/parser.act" +#line 423 "src/lx/parser.act" (ZIa) = ast_new(); if ((ZIa) == NULL) { @@ -2278,7 +2278,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* END OF 
ACTION: make-ast */ /* BEGINNING OF ACTION: make-zone */ { -#line 432 "src/lx/parser.act" +#line 431 "src/lx/parser.act" assert((ZIa) != NULL); @@ -2301,7 +2301,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* END OF ACTION: make-zone */ /* BEGINNING OF ACTION: no-exit */ { -#line 511 "src/lx/parser.act" +#line 510 "src/lx/parser.act" (ZIexit) = NULL; @@ -2310,7 +2310,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* END OF ACTION: no-exit */ /* BEGINNING OF ACTION: set-globalzone */ { -#line 500 "src/lx/parser.act" +#line 499 "src/lx/parser.act" assert((ZIa) != NULL); assert((ZIz) != NULL); @@ -2341,7 +2341,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 808 "src/lx/parser.act" +#line 806 "src/lx/parser.act" err_expected(lex_state, "EOF"); @@ -2358,7 +2358,7 @@ ZL1:; { /* BEGINNING OF ACTION: make-ast */ { -#line 424 "src/lx/parser.act" +#line 423 "src/lx/parser.act" (ZIa) = ast_new(); if ((ZIa) == NULL) { @@ -2371,7 +2371,7 @@ ZL1:; /* END OF ACTION: make-ast */ /* BEGINNING OF ACTION: err-syntax */ { -#line 776 "src/lx/parser.act" +#line 773 "src/lx/parser.act" err(lex_state, "Syntax error"); exit(EXIT_FAILURE); @@ -2433,7 +2433,7 @@ p_list_Hof_Hthings_C_Czone_Hthing_C_Czone_Hto_Hmapping(lex_state lex_state, act_ } /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2471,7 +2471,7 @@ p_list_Hof_Hthings_C_Czone_Hthing_C_Czone_Hto_Hmapping(lex_state lex_state, act_ /* END OF ACTION: add-mapping */ /* BEGINNING OF ACTION: clone */ { -#line 756 "src/lx/parser.act" +#line 755 "src/lx/parser.act" assert((ZIr) != NULL); @@ -2517,7 +2517,7 @@ p_112(lex_state lex_state, act_state act_state, string *ZOt) { /* BEGINNING OF ACTION: err-expected-map */ { -#line 784 "src/lx/parser.act" +#line 782 "src/lx/parser.act" err_expected(lex_state, "'->'"); @@ -2532,7 +2532,7 @@ 
p_112(lex_state lex_state, act_state act_state, string *ZOt) case (TOK_TOKEN): /* BEGINNING OF EXTRACT: TOKEN */ { -#line 224 "src/lx/parser.act" +#line 222 "src/lx/parser.act" /* TODO: submatch addressing */ ZIt = xstrdup(lex_state->buf.a + 1); /* +1 for '$' prefix */ @@ -2551,7 +2551,7 @@ p_112(lex_state lex_state, act_state act_state, string *ZOt) { /* BEGINNING OF ACTION: no-token */ { -#line 507 "src/lx/parser.act" +#line 506 "src/lx/parser.act" (ZIt) = NULL; @@ -2644,7 +2644,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-semi */ { -#line 792 "src/lx/parser.act" +#line 790 "src/lx/parser.act" err_expected(lex_state, "';'"); @@ -2695,7 +2695,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* BEGINNING OF ACTION: no-zone */ { -#line 515 "src/lx/parser.act" +#line 514 "src/lx/parser.act" (ZIto) = NULL; @@ -2704,7 +2704,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-zone */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2749,7 +2749,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* BEGINNING OF ACTION: make-zone */ { -#line 432 "src/lx/parser.act" +#line 431 "src/lx/parser.act" assert((*ZIa) != NULL); @@ -2772,7 +2772,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: make-zone */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2829,7 +2829,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-to */ { -#line 796 "src/lx/parser.act" +#line 794 "src/lx/parser.act" err_expected(lex_state, "'..'"); @@ -2858,7 +2858,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* 
BEGINNING OF ACTION: no-zone */ { -#line 515 "src/lx/parser.act" +#line 514 "src/lx/parser.act" (ZIx) = NULL; @@ -2867,7 +2867,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-zone */ /* BEGINNING OF ACTION: no-token */ { -#line 507 "src/lx/parser.act" +#line 506 "src/lx/parser.act" (ZIy) = NULL; @@ -2876,7 +2876,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-token */ /* BEGINNING OF ACTION: regex-any */ { -#line 395 "src/lx/parser.act" +#line 392 "src/lx/parser.act" fsm_state_t start, end; @@ -2912,7 +2912,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: regex-any */ /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZIw) != NULL); @@ -2937,7 +2937,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: subtract-exit */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2997,7 +2997,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-list */ { -#line 816 "src/lx/parser.act" +#line 814 "src/lx/parser.act" err_expected(lex_state, "list of mappings, bindings or zones"); @@ -3017,7 +3017,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* BEGINNING OF ACTION: no-exit */ { -#line 511 "src/lx/parser.act" +#line 510 "src/lx/parser.act" (ZIr2) = NULL; @@ -3026,7 +3026,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-exit */ /* BEGINNING OF ACTION: make-zone */ { -#line 432 "src/lx/parser.act" +#line 431 "src/lx/parser.act" assert((*ZIa) != NULL); @@ -3049,7 +3049,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone 
*ZIz, fsm *ZIexit /* END OF ACTION: make-zone */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -3101,7 +3101,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-list */ { -#line 816 "src/lx/parser.act" +#line 814 "src/lx/parser.act" err_expected(lex_state, "list of mappings, bindings or zones"); @@ -3149,7 +3149,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-bind */ { -#line 788 "src/lx/parser.act" +#line 786 "src/lx/parser.act" err_expected(lex_state, "'='"); @@ -3168,7 +3168,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* BEGINNING OF ACTION: add-binding */ { -#line 485 "src/lx/parser.act" +#line 482 "src/lx/parser.act" struct var *v; @@ -3211,7 +3211,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* BEGINNING OF ACTION: deref-var */ { -#line 280 "src/lx/parser.act" +#line 277 "src/lx/parser.act" struct ast_zone *z; @@ -3254,7 +3254,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI278) != NULL); @@ -3297,7 +3297,7 @@ ZL1:; /* BEGINNING OF TRAILER */ -#line 880 "src/lx/parser.act" +#line 818 "src/lx/parser.act" struct ast *lx_parse(FILE *f, const struct fsm_alloc *alloc) { diff --git a/src/lx/parser.h b/src/lx/parser.h index fdaff9879..947b194b5 100644 --- a/src/lx/parser.h +++ b/src/lx/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 139 "src/lx/parser.act" +#line 127 "src/lx/parser.act" #include @@ -29,7 +29,7 @@ extern void p_lx(lex_state, act_state, ast *); /* BEGINNING OF TRAILER */ -#line 882 "src/lx/parser.act" +#line 880 "src/lx/parser.act" #line 36 
"src/lx/parser.h" From 051c362412961984bcad53fdfd25202a98048df9 Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 28 Aug 2025 21:27:27 +0100 Subject: [PATCH 27/80] Typo. --- man/fsm.1/fsm.1.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/fsm.1/fsm.1.xml b/man/fsm.1/fsm.1.xml index 776aaaab9..caec5309f 100644 --- a/man/fsm.1/fsm.1.xml +++ b/man/fsm.1/fsm.1.xml @@ -22,6 +22,7 @@ io"> iterations"> length"> + limit"> charset"> -a"> @@ -33,7 +34,7 @@ -G &length.arg;"> -k &io.arg;"> -i &iterations.arg;"> - -S &limit.arg;"> + -S &limit.arg;"> -U &charset.arg;"> -X"> From 87f6df26550658d51da875e286efc4781cd73f75 Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 28 Aug 2025 21:28:34 +0100 Subject: [PATCH 28/80] Stray const. --- src/libfsm/print/c.c | 2 +- src/libfsm/print/vmc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index 7f9274792..f55b2e748 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -581,7 +581,7 @@ fsm_print_c(FILE *f, case AMBIG_ERROR: case AMBIG_EARLIEST: fprintf(f, ",\n"); - fprintf(f, "\tconst unsigned *id"); + fprintf(f, "\tunsigned *id"); break; case AMBIG_MULTIPLE: diff --git a/src/libfsm/print/vmc.c b/src/libfsm/print/vmc.c index 9ea06f27a..b427a9cb8 100644 --- a/src/libfsm/print/vmc.c +++ b/src/libfsm/print/vmc.c @@ -588,7 +588,7 @@ fsm_print_vmc(FILE *f, case AMBIG_ERROR: case AMBIG_EARLIEST: fprintf(f, ",\n"); - fprintf(f, "\tconst unsigned *id"); + fprintf(f, "\tunsigned *id"); break; case AMBIG_MULTIPLE: From 664b32cd567cbb884a25bc9d1f15e3411183e5e8 Mon Sep 17 00:00:00 2001 From: Dmitry Atamanov Date: Thu, 11 Sep 2025 01:04:25 +0500 Subject: [PATCH 29/80] Update to Unicode 17.0 --- share/ucd/CaseFolding.txt | 40 ++- share/ucd/Makefile | 2 +- share/ucd/Scripts.txt | 142 ++++++--- share/ucd/UnicodeData.txt | 489 ++++++++++++++++++++++++++++- src/libre/class.h | 4 + src/libre/class/utf8_Arabic.c | 15 +- src/libre/class/utf8_Beria_Erfe.c | 14 
+ src/libre/class/utf8_Common.c | 30 +- src/libre/class/utf8_Han.c | 9 +- src/libre/class/utf8_Inherited.c | 3 +- src/libre/class/utf8_Kannada.c | 2 +- src/libre/class/utf8_L.c | 40 ++- src/libre/class/utf8_Latin.c | 7 +- src/libre/class/utf8_Ll.c | 4 +- src/libre/class/utf8_Lm.c | 6 +- src/libre/class/utf8_Lo.c | 32 +- src/libre/class/utf8_Lu.c | 4 + src/libre/class/utf8_M.c | 10 +- src/libre/class/utf8_Mc.c | 3 + src/libre/class/utf8_Mn.c | 12 +- src/libre/class/utf8_N.c | 2 + src/libre/class/utf8_Nd.c | 1 + src/libre/class/utf8_Nl.c | 3 +- src/libre/class/utf8_P.c | 1 + src/libre/class/utf8_Po.c | 1 + src/libre/class/utf8_S.c | 36 ++- src/libre/class/utf8_Sc.c | 2 +- src/libre/class/utf8_Sharada.c | 3 +- src/libre/class/utf8_Sidetic.c | 13 + src/libre/class/utf8_Sm.c | 4 +- src/libre/class/utf8_So.c | 32 +- src/libre/class/utf8_Tai_Yo.c | 15 + src/libre/class/utf8_Tangut.c | 6 +- src/libre/class/utf8_Telugu.c | 2 +- src/libre/class/utf8_Tolong_Siki.c | 14 + src/libre/class/utf8_assigned.c | 83 ++--- src/libre/class_name.c | 4 + 37 files changed, 881 insertions(+), 209 deletions(-) create mode 100644 src/libre/class/utf8_Beria_Erfe.c create mode 100644 src/libre/class/utf8_Sidetic.c create mode 100644 src/libre/class/utf8_Tai_Yo.c create mode 100644 src/libre/class/utf8_Tolong_Siki.c diff --git a/share/ucd/CaseFolding.txt b/share/ucd/CaseFolding.txt index 1b7a9c156..a0b0f07fd 100644 --- a/share/ucd/CaseFolding.txt +++ b/share/ucd/CaseFolding.txt @@ -1,6 +1,6 @@ -# CaseFolding-16.0.0.txt -# Date: 2024-04-30, 21:48:11 GMT -# © 2024 Unicode®, Inc. +# CaseFolding-17.0.0.txt +# Date: 2025-07-30, 23:54:36 GMT +# © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use and license, see https://www.unicode.org/terms_of_use.html # @@ -18,15 +18,15 @@ # The data supports both implementations that require simple case foldings # (where string lengths don't change), and implementations that allow full case folding # (where string lengths may grow). Note that where they can be supported, the -# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# full case foldings are superior: for example, they allow "FUSS" and "Fuß" to match. # # All code points not listed in this file map to themselves. # # NOTE: case folding does not preserve normalization formats! # # For information on case folding, including how to have case folding -# preserve normalization formats, see Section 3.13 Default Case Algorithms in -# The Unicode Standard. +# preserve normalization formats, see the +# "Conformance" / "Default Case Algorithms" section of the core specification. # # ================================================================================ # Format @@ -1243,7 +1243,10 @@ A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY A7CB; C; 0264; # LATIN CAPITAL LETTER RAMS HORN A7CC; C; A7CD; # LATIN CAPITAL LETTER S WITH DIAGONAL STROKE +A7CE; C; A7CF; # LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G +A7D2; C; A7D3; # LATIN CAPITAL LETTER DOUBLE THORN +A7D4; C; A7D5; # LATIN CAPITAL LETTER DOUBLE WYNN A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S A7DA; C; A7DB; # LATIN CAPITAL LETTER LAMBDA @@ -1616,6 +1619,31 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O 16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI 16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y +16EA0; C; 16EBB; # BERIA ERFE CAPITAL LETTER ARKAB +16EA1; C; 16EBC; # BERIA ERFE CAPITAL LETTER BASIGNA 
+16EA2; C; 16EBD; # BERIA ERFE CAPITAL LETTER DARBAI +16EA3; C; 16EBE; # BERIA ERFE CAPITAL LETTER EH +16EA4; C; 16EBF; # BERIA ERFE CAPITAL LETTER FITKO +16EA5; C; 16EC0; # BERIA ERFE CAPITAL LETTER GOWAY +16EA6; C; 16EC1; # BERIA ERFE CAPITAL LETTER HIRDEABO +16EA7; C; 16EC2; # BERIA ERFE CAPITAL LETTER I +16EA8; C; 16EC3; # BERIA ERFE CAPITAL LETTER DJAI +16EA9; C; 16EC4; # BERIA ERFE CAPITAL LETTER KOBO +16EAA; C; 16EC5; # BERIA ERFE CAPITAL LETTER LAKKO +16EAB; C; 16EC6; # BERIA ERFE CAPITAL LETTER MERI +16EAC; C; 16EC7; # BERIA ERFE CAPITAL LETTER NINI +16EAD; C; 16EC8; # BERIA ERFE CAPITAL LETTER GNA +16EAE; C; 16EC9; # BERIA ERFE CAPITAL LETTER NGAY +16EAF; C; 16ECA; # BERIA ERFE CAPITAL LETTER OI +16EB0; C; 16ECB; # BERIA ERFE CAPITAL LETTER PI +16EB1; C; 16ECC; # BERIA ERFE CAPITAL LETTER ERIGO +16EB2; C; 16ECD; # BERIA ERFE CAPITAL LETTER ERIGO TAMURA +16EB3; C; 16ECE; # BERIA ERFE CAPITAL LETTER SERI +16EB4; C; 16ECF; # BERIA ERFE CAPITAL LETTER SHEP +16EB5; C; 16ED0; # BERIA ERFE CAPITAL LETTER TATASOUE +16EB6; C; 16ED1; # BERIA ERFE CAPITAL LETTER UI +16EB7; C; 16ED2; # BERIA ERFE CAPITAL LETTER WASSE +16EB8; C; 16ED3; # BERIA ERFE CAPITAL LETTER AY 1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF 1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI 1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM diff --git a/share/ucd/Makefile b/share/ucd/Makefile index 833910330..22aa0a869 100644 --- a/share/ucd/Makefile +++ b/share/ucd/Makefile @@ -1,4 +1,4 @@ -UCD_URL ?= https://www.unicode.org/Public/16.0.0/ucd/ +UCD_URL ?= https://www.unicode.org/Public/17.0.0/ucd/ WGET ?= wget diff --git a/share/ucd/Scripts.txt b/share/ucd/Scripts.txt index 443a6d2dd..5574fdd6a 100644 --- a/share/ucd/Scripts.txt +++ b/share/ucd/Scripts.txt @@ -1,6 +1,6 @@ -# Scripts-16.0.0.txt -# Date: 2024-04-30, 21:48:40 GMT -# © 2024 Unicode®, Inc. +# Scripts-17.0.0.txt +# Date: 2025-07-24, 13:28:55 GMT +# © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. 
in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # @@ -154,7 +154,7 @@ 208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS 208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS -20A0..20C0 ; Common # Sc [33] EURO-CURRENCY SIGN..SOM SIGN +20A0..20C1 ; Common # Sc [34] EURO-CURRENCY SIGN..SAUDI RIYAL SIGN 2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT 2102 ; Common # L& DOUBLE-STRUCK CAPITAL C 2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA @@ -306,8 +306,7 @@ 2B45..2B46 ; Common # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Common # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR 2B4D..2B73 ; Common # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR -2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW -2B97..2BFF ; Common # So [105] SYMBOL FOR TYPE A ELECTRONICS..HELLSCHREIBER PAUSE SYMBOL +2B76..2BFF ; Common # So [138] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..HELLSCHREIBER PAUSE SYMBOL 2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Common # Pf RIGHT SUBSTITUTION BRACKET @@ -524,7 +523,11 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR 1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1CC00..1CCEF ; Common # So [240] UP-POINTING GO-KART..OUTLINED LATIN CAPITAL LETTER Z 1CCF0..1CCF9 ; Common # Nd [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE +1CCFA..1CCFC ; Common # So [3] SNAKE SYMBOL..NOSE SYMBOL 1CD00..1CEB3 ; Common # So [436] BLOCK OCTANT-3..BLACK RIGHT TRIANGLE CARET +1CEBA..1CED0 ; Common # So [23] FRAGILE SYMBOL..LEUKOTHEA +1CEE0..1CEEF ; Common # So [16] GEOMANTIC FIGURE 
POPULUS..GEOMANTIC FIGURE VIA +1CEF0 ; Common # Sm MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR 1CF50..1CFC3 ; Common # So [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK 1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO 1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 @@ -605,11 +608,10 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR 1F260..1F265 ; Common # So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI 1F300..1F3FA ; Common # So [251] CYCLONE..AMPHORA 1F3FB..1F3FF ; Common # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 -1F400..1F6D7 ; Common # So [728] RAT..ELEVATOR +1F400..1F6D8 ; Common # So [729] RAT..LANDSLIDE 1F6DC..1F6EC ; Common # So [17] WIRELESS..AIRPLANE ARRIVING 1F6F0..1F6FC ; Common # So [13] SATELLITE..ROLLER SKATE -1F700..1F776 ; Common # So [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE -1F77B..1F7D9 ; Common # So [95] HAUMEA..NINE POINTED WHITE STAR +1F700..1F7D9 ; Common # So [218] ALCHEMICAL SYMBOL FOR QUINTESSENCE..NINE POINTED WHITE STAR 1F7E0..1F7EB ; Common # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE 1F7F0 ; Common # So HEAVY EQUALS SIGN 1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD @@ -619,21 +621,24 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR 1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS 1F8B0..1F8BB ; Common # So [12] ARROW POINTING UPWARDS THEN NORTH WEST..SOUTH WEST ARROW FROM BAR 1F8C0..1F8C1 ; Common # So [2] LEFTWARDS ARROW FROM DOWNWARDS ARROW..RIGHTWARDS ARROW FROM DOWNWARDS ARROW -1F900..1FA53 ; Common # So [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP +1F8D0..1F8D8 ; Common # Sm [9] LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW..LONG LEFT RIGHT ARROW WITH 
DEPENDENT LOBE +1F900..1FA57 ; Common # So [344] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS ALFIL 1FA60..1FA6D ; Common # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER 1FA70..1FA7C ; Common # So [13] BALLET SHOES..CRUTCH -1FA80..1FA89 ; Common # So [10] YO-YO..HARP -1FA8F..1FAC6 ; Common # So [56] SHOVEL..FINGERPRINT -1FACE..1FADC ; Common # So [15] MOOSE..ROOT VEGETABLE -1FADF..1FAE9 ; Common # So [11] SPLATTER..FACE WITH BAGS UNDER EYES -1FAF0..1FAF8 ; Common # So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND +1FA80..1FA8A ; Common # So [11] YO-YO..TROMBONE +1FA8E..1FAC6 ; Common # So [57] TREASURE CHEST..FINGERPRINT +1FAC8 ; Common # So HAIRY CREATURE +1FACD..1FADC ; Common # So [16] ORCA..ROOT VEGETABLE +1FADF..1FAEA ; Common # So [12] SPLATTER..DISTORTED FACE +1FAEF..1FAF8 ; Common # So [10] FIGHT CLOUD..RIGHTWARDS PUSHING HAND 1FB00..1FB92 ; Common # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK 1FB94..1FBEF ; Common # So [92] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..TOP LEFT JUSTIFIED LOWER RIGHT QUARTER BLACK CIRCLE 1FBF0..1FBF9 ; Common # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE +1FBFA ; Common # So ALARM BELL SYMBOL E0001 ; Common # Cf LANGUAGE TAG E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG -# Total code points: 9053 +# Total code points: 9123 # ================================================ @@ -648,8 +653,8 @@ E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG 01BC..01BF ; Latin # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; Latin # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; Latin # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL -0294 ; Latin # Lo LATIN LETTER GLOTTAL STOP -0295..02AF ; Latin # L& [27] LATIN LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL +0294..0295 ; Latin # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER 
PHARYNGEAL VOICED FRICATIVE +0296..02AF ; Latin # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02B8 ; Latin # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y 02E0..02E4 ; Latin # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 1D00..1D25 ; Latin # L& [38] LATIN LETTER SMALL CAPITAL A..LATIN LETTER AIN @@ -676,11 +681,8 @@ A770 ; Latin # Lm MODIFIER LETTER US A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; Latin # Lo LATIN LETTER SINOLOGICAL DOT -A790..A7CD ; Latin # L& [62] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER S WITH DIAGONAL STROKE -A7D0..A7D1 ; Latin # L& [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G -A7D3 ; Latin # L& LATIN SMALL LETTER DOUBLE THORN -A7D5..A7DC ; Latin # L& [8] LATIN SMALL LETTER DOUBLE WYNN..LATIN CAPITAL LETTER LAMBDA WITH STROKE -A7F2..A7F4 ; Latin # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q +A790..A7DC ; Latin # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE +A7F1..A7F4 ; Latin # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; Latin # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE @@ -702,7 +704,7 @@ FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN 1DF0B..1DF1E ; Latin # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; Latin # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK -# Total code points: 1487 +# Total code points: 1492 # 
================================================ @@ -869,7 +871,7 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU 0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE 0870..0887 ; Arabic # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0888 ; Arabic # Sk ARABIC RAISED ROUND DOT -0889..088E ; Arabic # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL +0889..088F ; Arabic # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 0890..0891 ; Arabic # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE 0897..089F ; Arabic # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; Arabic # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF @@ -878,11 +880,13 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU 08E3..08FF ; Arabic # Mn [29] ARABIC TURNED DAMMA BELOW..ARABIC MARK SIDEWAYS NOON GHUNNA FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBB2..FBC2 ; Arabic # Sk [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE +FBC3..FBD2 ; Arabic # So [16] ARABIC LIGATURE JALLA WA-ALAA..ARABIC LIGATURE ALAYHI AR-RAHMAH FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD40..FD4F ; Arabic # So [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH FD50..FD8F ; Arabic # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM +FD90..FD91 ; Arabic # So [2] ARABIC LIGATURE RAHMATU ALLAAHI ALAYH..ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM -FDCF ; Arabic # So ARABIC LIGATURE SALAAMUHU ALAYNAA +FDC8..FDCF ; Arabic 
# So [8] ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA..ARABIC LIGATURE SALAAMUHU ALAYNAA FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FDFC ; Arabic # Sc RIAL SIGN FDFD..FDFF ; Arabic # So [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL @@ -890,7 +894,11 @@ FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM 10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS 10EC2..10EC4 ; Arabic # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW -10EFC..10EFF ; Arabic # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA +10EC5 ; Arabic # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW +10EC6..10EC7 ; Arabic # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW +10ED0 ; Arabic # Po ARABIC BIBLICAL END OF VERSE +10ED1..10ED8 ; Arabic # So [8] ARABIC LIGATURE ALAYHAA AS-SALAATU WAS-SALAAM..ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH +10EFA..10EFF ; Arabic # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 1EE00..1EE03 ; Arabic # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; Arabic # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; Arabic # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM @@ -926,7 +934,7 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA 1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL -# Total code points: 1373 +# Total code points: 1413 # ================================================ @@ -1155,7 +1163,7 @@ A8FF ; 
Devanagari # Mn DEVANAGARI VOWEL SIGN AY 0C4A..0C4D ; Telugu # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; Telugu # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C58..0C5A ; Telugu # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA -0C5D ; Telugu # Lo TELUGU LETTER NAKAARA POLLU +0C5C..0C5D ; Telugu # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; Telugu # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C62..0C63 ; Telugu # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C66..0C6F ; Telugu # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE @@ -1163,7 +1171,7 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY 0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR 0C7F ; Telugu # So TELUGU SIGN TUUMU -# Total code points: 100 +# Total code points: 101 # ================================================ @@ -1186,14 +1194,14 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY 0CCA..0CCB ; Kannada # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC..0CCD ; Kannada # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CD5..0CD6 ; Kannada # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK -0CDD..0CDE ; Kannada # Lo [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA +0CDC..0CDE ; Kannada # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; Kannada # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CE2..0CE3 ; Kannada # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0CF3 ; Kannada # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT -# Total code points: 91 +# Total code points: 92 # ================================================ @@ -1594,17 +1602,18 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI 16FE2 ; Han 
# Po OLD CHINESE HOOK MARK 16FE3 ; Han # Lm OLD CHINESE ITERATION MARK 16FF0..16FF1 ; Han # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; Han # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF4..16FF6 ; Han # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 20000..2A6DF ; Han # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF -2A700..2B739 ; Han # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 -2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D -2B820..2CEA1 ; Han # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 +2A700..2B81D ; Han # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D +2B820..2CEAD ; Han # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; Han # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; Han # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; Han # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A -31350..323AF ; Han # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF +31350..33479 ; Han # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 -# Total code points: 99030 +# Total code points: 103351 # ================================================ @@ -1647,7 +1656,8 @@ A490..A4C6 ; Yi # So [55] YI RADICAL QOT..YI RADICAL KE 0951..0954 ; Inherited # Mn [4] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI ACUTE ACCENT 1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY -1ABF..1ACE ; Inherited # Mn [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T +1ABF..1ADD ; Inherited # Mn [31] COMBINING 
LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW +1AE0..1AEB ; Inherited # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL @@ -1676,7 +1686,7 @@ FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CON 1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 657 +# Total code points: 684 # ================================================ @@ -2347,8 +2357,14 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI 111DB ; Sharada # Po SHARADA SIGN SIDDHAM 111DC ; Sharada # Lo SHARADA HEADSTROKE 111DD..111DF ; Sharada # Po [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 +11B60 ; Sharada # Mn SHARADA VOWEL SIGN OE +11B61 ; Sharada # Mc SHARADA VOWEL SIGN OOE +11B62..11B64 ; Sharada # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E +11B65 ; Sharada # Mc SHARADA VOWEL SIGN SHORT O +11B66 ; Sharada # Mn SHARADA VOWEL SIGN CANDRA E +11B67 ; Sharada # Mc SHARADA VOWEL SIGN CANDRA O -# Total code points: 96 +# Total code points: 104 # ================================================ @@ -2756,11 +2772,11 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI # ================================================ 16FE0 ; Tangut # Lm TANGUT ITERATION MARK -17000..187F7 ; Tangut # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 -18800..18AFF ; Tangut # Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 -18D00..18D08 ; Tangut # Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08 +17000..18AFF ; Tangut # Lo [6912] 
TANGUT IDEOGRAPH-17000..TANGUT COMPONENT-768 +18D00..18D1E ; Tangut # Lo [31] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D1E +18D80..18DF2 ; Tangut # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 -# Total code points: 6914 +# Total code points: 7059 # ================================================ @@ -3125,4 +3141,42 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI # Total code points: 80 +# ================================================ + +10940..10959 ; Sidetic # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 + +# Total code points: 26 + +# ================================================ + +1E6C0..1E6DE ; Tai_Yo # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO +1E6E0..1E6E2 ; Tai_Yo # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE +1E6E3 ; Tai_Yo # Mn TAI YO SIGN UE +1E6E4..1E6E5 ; Tai_Yo # Lo [2] TAI YO LETTER U..TAI YO LETTER AE +1E6E6 ; Tai_Yo # Mn TAI YO SIGN AU +1E6E7..1E6ED ; Tai_Yo # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE +1E6EE..1E6EF ; Tai_Yo # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG +1E6F0..1E6F4 ; Tai_Yo # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP +1E6F5 ; Tai_Yo # Mn TAI YO SIGN OM +1E6FE ; Tai_Yo # Lo TAI YO SYMBOL MUEANG +1E6FF ; Tai_Yo # Lm TAI YO XAM LAI + +# Total code points: 55 + +# ================================================ + +11DB0..11DD8 ; Tolong_Siki # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH +11DD9 ; Tolong_Siki # Lm TOLONG SIKI SIGN SELA +11DDA..11DDB ; Tolong_Siki # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA +11DE0..11DE9 ; Tolong_Siki # Nd [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE + +# Total code points: 54 + +# ================================================ + +16EA0..16EB8 ; Beria_Erfe # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY +16EBB..16ED3 ; Beria_Erfe # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY + +# Total code points: 50 + # EOF diff --git a/share/ucd/UnicodeData.txt b/share/ucd/UnicodeData.txt index 
64258a373..fca68e3e1 100644 --- a/share/ucd/UnicodeData.txt +++ b/share/ucd/UnicodeData.txt @@ -659,7 +659,7 @@ 0292;LATIN SMALL LETTER EZH;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH;;01B7;;01B7 0293;LATIN SMALL LETTER EZH WITH CURL;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH CURL;;;; 0294;LATIN LETTER GLOTTAL STOP;Lo;0;L;;;;;N;;;;; -0295;LATIN LETTER PHARYNGEAL VOICED FRICATIVE;Ll;0;L;;;;;N;LATIN LETTER REVERSED GLOTTAL STOP;;;; +0295;LATIN LETTER PHARYNGEAL VOICED FRICATIVE;Lo;0;L;;;;;N;LATIN LETTER REVERSED GLOTTAL STOP;;;; 0296;LATIN LETTER INVERTED GLOTTAL STOP;Ll;0;L;;;;;N;;;;; 0297;LATIN LETTER STRETCHED C;Ll;0;L;;;;;N;;;;; 0298;LATIN LETTER BILABIAL CLICK;Ll;0;L;;;;;N;LATIN LETTER BULLSEYE;;;; @@ -2121,6 +2121,7 @@ 088C;ARABIC LETTER TAH WITH THREE DOTS BELOW;Lo;0;AL;;;;;N;;;;; 088D;ARABIC LETTER KEHEH WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; 088E;ARABIC VERTICAL TAIL;Lo;0;AL;;;;;N;;;;; +088F;ARABIC LETTER NOON WITH RING ABOVE;Lo;0;AL;;;;;N;;;;; 0890;ARABIC POUND MARK ABOVE;Cf;0;AN;;;;;N;;;;; 0891;ARABIC PIASTRE MARK ABOVE;Cf;0;AN;;;;;N;;;;; 0897;ARABIC PEPET;Mn;230;NSM;;;;;N;;;;; @@ -2862,6 +2863,7 @@ 0C58;TELUGU LETTER TSA;Lo;0;L;;;;;N;;;;; 0C59;TELUGU LETTER DZA;Lo;0;L;;;;;N;;;;; 0C5A;TELUGU LETTER RRRA;Lo;0;L;;;;;N;;;;; +0C5C;TELUGU ARCHAIC SHRII;Lo;0;L;;;;;N;;;;; 0C5D;TELUGU LETTER NAKAARA POLLU;Lo;0;L;;;;;N;;;;; 0C60;TELUGU LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; 0C61;TELUGU LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;; @@ -2958,6 +2960,7 @@ 0CCD;KANNADA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;; 0CD5;KANNADA LENGTH MARK;Mc;0;L;;;;;N;;;;; 0CD6;KANNADA AI LENGTH MARK;Mc;0;L;;;;;N;;;;; +0CDC;KANNADA ARCHAIC SHRII;Lo;0;L;;;;;N;;;;; 0CDD;KANNADA LETTER NAKAARA POLLU;Lo;0;L;;;;;N;;;;; 0CDE;KANNADA LETTER FA;Lo;0;L;;;;;N;;;;; 0CE0;KANNADA LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; @@ -6137,6 +6140,33 @@ 1ACC;COMBINING LATIN SMALL LETTER INSULAR G;Mn;230;NSM;;;;;N;;;;; 1ACD;COMBINING LATIN SMALL LETTER INSULAR R;Mn;230;NSM;;;;;N;;;;; 1ACE;COMBINING LATIN SMALL LETTER INSULAR 
T;Mn;230;NSM;;;;;N;;;;; +1ACF;COMBINING DOUBLE CARON;Mn;230;NSM;;;;;N;;;;; +1AD0;COMBINING VERTICAL-LINE-ACUTE;Mn;230;NSM;;;;;N;;;;; +1AD1;COMBINING GRAVE-VERTICAL-LINE;Mn;230;NSM;;;;;N;;;;; +1AD2;COMBINING VERTICAL-LINE-GRAVE;Mn;230;NSM;;;;;N;;;;; +1AD3;COMBINING ACUTE-VERTICAL-LINE;Mn;230;NSM;;;;;N;;;;; +1AD4;COMBINING VERTICAL-LINE-MACRON;Mn;230;NSM;;;;;N;;;;; +1AD5;COMBINING MACRON-VERTICAL-LINE;Mn;230;NSM;;;;;N;;;;; +1AD6;COMBINING VERTICAL-LINE-ACUTE-GRAVE;Mn;230;NSM;;;;;N;;;;; +1AD7;COMBINING VERTICAL-LINE-GRAVE-ACUTE;Mn;230;NSM;;;;;N;;;;; +1AD8;COMBINING MACRON-ACUTE-GRAVE;Mn;230;NSM;;;;;N;;;;; +1AD9;COMBINING SHARP SIGN;Mn;230;NSM;;;;;N;;;;; +1ADA;COMBINING FLAT SIGN;Mn;230;NSM;;;;;N;;;;; +1ADB;COMBINING DOWN TACK ABOVE;Mn;230;NSM;;;;;N;;;;; +1ADC;COMBINING DIAERESIS WITH RAISED LEFT DOT;Mn;230;NSM;;;;;N;;;;; +1ADD;COMBINING DOT-AND-RING BELOW;Mn;220;NSM;;;;;N;;;;; +1AE0;COMBINING LEFT TACK ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE1;COMBINING RIGHT TACK ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE2;COMBINING MINUS SIGN ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE3;COMBINING INVERTED BRIDGE ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE4;COMBINING SQUARE ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE5;COMBINING SEAGULL ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE6;COMBINING DOUBLE ARCH BELOW;Mn;220;NSM;;;;;N;;;;; +1AE7;COMBINING DOUBLE ARCH ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE8;COMBINING EQUALS SIGN ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE9;COMBINING LEFT ANGLE CENTRED ABOVE;Mn;230;NSM;;;;;N;;;;; +1AEA;COMBINING UPWARDS ARROW ABOVE;Mn;230;NSM;;;;;N;;;;; +1AEB;COMBINING DOUBLE RIGHTWARDS ARROW ABOVE;Mn;234;NSM;;;;;N;;;;; 1B00;BALINESE SIGN ULU RICEM;Mn;0;NSM;;;;;N;;;;; 1B01;BALINESE SIGN ULU CANDRA;Mn;0;NSM;;;;;N;;;;; 1B02;BALINESE SIGN CECEK;Mn;0;NSM;;;;;N;;;;; @@ -7545,6 +7575,7 @@ 20BE;LARI SIGN;Sc;0;ET;;;;;N;;;;; 20BF;BITCOIN SIGN;Sc;0;ET;;;;;N;;;;; 20C0;SOM SIGN;Sc;0;ET;;;;;N;;;;; +20C1;SAUDI RIYAL SIGN;Sc;0;ET;;;;;N;;;;; 20D0;COMBINING LEFT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING LEFT HARPOON ABOVE;;;; 20D1;COMBINING RIGHT HARPOON 
ABOVE;Mn;230;NSM;;;;;N;NON-SPACING RIGHT HARPOON ABOVE;;;; 20D2;COMBINING LONG VERTICAL LINE OVERLAY;Mn;1;NSM;;;;;N;NON-SPACING LONG VERTICAL BAR OVERLAY;;;; @@ -10239,6 +10270,7 @@ 2B93;NEWLINE RIGHT;So;0;ON;;;;;N;;;;; 2B94;FOUR CORNER ARROWS CIRCLING ANTICLOCKWISE;So;0;ON;;;;;N;;;;; 2B95;RIGHTWARDS BLACK ARROW;So;0;ON;;;;;N;;;;; +2B96;EQUALS SIGN WITH INFINITY ABOVE;So;0;ON;;;;;N;;;;; 2B97;SYMBOL FOR TYPE A ELECTRONICS;So;0;ON;;;;;N;;;;; 2B98;THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; 2B99;THREE-D RIGHT-LIGHTED UPWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; @@ -14274,10 +14306,14 @@ A7CA;LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY;Ll;0;L;;;;;N;;;A7C9;;A7C9 A7CB;LATIN CAPITAL LETTER RAMS HORN;Lu;0;L;;;;;N;;;;0264; A7CC;LATIN CAPITAL LETTER S WITH DIAGONAL STROKE;Lu;0;L;;;;;N;;;;A7CD; A7CD;LATIN SMALL LETTER S WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;A7CC;;A7CC +A7CE;LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE;Lu;0;L;;;;;N;;;;A7CF; +A7CF;LATIN SMALL LETTER PHARYNGEAL VOICED FRICATIVE;Ll;0;L;;;;;N;;;A7CE;;A7CE A7D0;LATIN CAPITAL LETTER CLOSED INSULAR G;Lu;0;L;;;;;N;;;;A7D1; A7D1;LATIN SMALL LETTER CLOSED INSULAR G;Ll;0;L;;;;;N;;;A7D0;;A7D0 -A7D3;LATIN SMALL LETTER DOUBLE THORN;Ll;0;L;;;;;N;;;;; -A7D5;LATIN SMALL LETTER DOUBLE WYNN;Ll;0;L;;;;;N;;;;; +A7D2;LATIN CAPITAL LETTER DOUBLE THORN;Lu;0;L;;;;;N;;;;A7D3; +A7D3;LATIN SMALL LETTER DOUBLE THORN;Ll;0;L;;;;;N;;;A7D2;;A7D2 +A7D4;LATIN CAPITAL LETTER DOUBLE WYNN;Lu;0;L;;;;;N;;;;A7D5; +A7D5;LATIN SMALL LETTER DOUBLE WYNN;Ll;0;L;;;;;N;;;A7D4;;A7D4 A7D6;LATIN CAPITAL LETTER MIDDLE SCOTS S;Lu;0;L;;;;;N;;;;A7D7; A7D7;LATIN SMALL LETTER MIDDLE SCOTS S;Ll;0;L;;;;;N;;;A7D6;;A7D6 A7D8;LATIN CAPITAL LETTER SIGMOID S;Lu;0;L;;;;;N;;;;A7D9; @@ -14285,6 +14321,7 @@ A7D9;LATIN SMALL LETTER SIGMOID S;Ll;0;L;;;;;N;;;A7D8;;A7D8 A7DA;LATIN CAPITAL LETTER LAMBDA;Lu;0;L;;;;;N;;;;A7DB; A7DB;LATIN SMALL LETTER LAMBDA;Ll;0;L;;;;;N;;;A7DA;;A7DA A7DC;LATIN CAPITAL LETTER LAMBDA WITH STROKE;Lu;0;L;;;;;N;;;;019B; 
+A7F1;MODIFIER LETTER CAPITAL S;Lm;0;L; 0053;;;;N;;;;; A7F2;MODIFIER LETTER CAPITAL C;Lm;0;L; 0043;;;;N;;;;; A7F3;MODIFIER LETTER CAPITAL F;Lm;0;L; 0046;;;;N;;;;; A7F4;MODIFIER LETTER CAPITAL Q;Lm;0;L; 0051;;;;N;;;;; @@ -15925,6 +15962,22 @@ FBBF;ARABIC SYMBOL RING;Sk;0;AL;;;;;N;;;;; FBC0;ARABIC SYMBOL SMALL TAH ABOVE;Sk;0;AL;;;;;N;;;;; FBC1;ARABIC SYMBOL SMALL TAH BELOW;Sk;0;AL;;;;;N;;;;; FBC2;ARABIC SYMBOL WASLA ABOVE;Sk;0;AL;;;;;N;;;;; +FBC3;ARABIC LIGATURE JALLA WA-ALAA;So;0;ON;;;;;N;;;;; +FBC4;ARABIC LIGATURE DAAMAT BARAKAATUHUM;So;0;ON;;;;;N;;;;; +FBC5;ARABIC LIGATURE RAHMATU ALLAAHI TAAALAA ALAYH;So;0;ON;;;;;N;;;;; +FBC6;ARABIC LIGATURE RAHMATU ALLAAHI ALAYHIM;So;0;ON;;;;;N;;;;; +FBC7;ARABIC LIGATURE RAHMATU ALLAAHI ALAYHIMAA;So;0;ON;;;;;N;;;;; +FBC8;ARABIC LIGATURE RAHIMAHUM ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBC9;ARABIC LIGATURE RAHIMAHUMAA ALLAAH;So;0;ON;;;;;N;;;;; +FBCA;ARABIC LIGATURE RAHIMAHUMAA ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBCB;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANHUM;So;0;ON;;;;;N;;;;; +FBCC;ARABIC LIGATURE HAFIZAHU ALLAAH;So;0;ON;;;;;N;;;;; +FBCD;ARABIC LIGATURE HAFIZAHU ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBCE;ARABIC LIGATURE HAFIZAHUM ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBCF;ARABIC LIGATURE HAFIZAHUMAA ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBD0;ARABIC LIGATURE SALLALLAAHU TAAALAA ALAYHI WA-SALLAM;So;0;ON;;;;;N;;;;; +FBD1;ARABIC LIGATURE AJJAL ALLAAHU FARAJAHU ASH-SHAREEF;So;0;ON;;;;;N;;;;; +FBD2;ARABIC LIGATURE ALAYHI AR-RAHMAH;So;0;ON;;;;;N;;;;; FBD3;ARABIC LETTER NG ISOLATED FORM;Lo;0;AL; 06AD;;;;N;;;;; FBD4;ARABIC LETTER NG FINAL FORM;Lo;0;AL; 06AD;;;;N;;;;; FBD5;ARABIC LETTER NG INITIAL FORM;Lo;0;AL; 06AD;;;;N;;;;; @@ -16370,6 +16423,8 @@ FD8C;ARABIC LIGATURE MEEM WITH JEEM WITH HAH INITIAL FORM;Lo;0;AL; 0645 FD8D;ARABIC LIGATURE MEEM WITH JEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0645 062C 0645;;;;N;;;;; FD8E;ARABIC LIGATURE MEEM WITH KHAH WITH JEEM INITIAL FORM;Lo;0;AL; 0645 062E 062C;;;;N;;;;; FD8F;ARABIC LIGATURE MEEM WITH KHAH WITH 
MEEM INITIAL FORM;Lo;0;AL; 0645 062E 0645;;;;N;;;;; +FD90;ARABIC LIGATURE RAHMATU ALLAAHI ALAYH;So;0;ON;;;;;N;;;;; +FD91;ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA;So;0;ON;;;;;N;;;;; FD92;ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM;Lo;0;AL; 0645 062C 062E;;;;N;;;;; FD93;ARABIC LIGATURE HEH WITH MEEM WITH JEEM INITIAL FORM;Lo;0;AL; 0647 0645 062C;;;;N;;;;; FD94;ARABIC LIGATURE HEH WITH MEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0647 0645 0645;;;;N;;;;; @@ -16424,6 +16479,13 @@ FDC4;ARABIC LIGATURE AIN WITH JEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0639 FDC5;ARABIC LIGATURE SAD WITH MEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0635 0645 0645;;;;N;;;;; FDC6;ARABIC LIGATURE SEEN WITH KHAH WITH YEH FINAL FORM;Lo;0;AL; 0633 062E 064A;;;;N;;;;; FDC7;ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM;Lo;0;AL; 0646 062C 064A;;;;N;;;;; +FDC8;ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA;So;0;ON;;;;;N;;;;; +FDC9;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANH;So;0;ON;;;;;N;;;;; +FDCA;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANHAA;So;0;ON;;;;;N;;;;; +FDCB;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANHUMAA;So;0;ON;;;;;N;;;;; +FDCC;ARABIC LIGATURE SALLALLAHU ALAYHI WA-ALAA AALIHEE WA-SALLAM;So;0;ON;;;;;N;;;;; +FDCD;ARABIC LIGATURE AJJAL ALLAAHU TAAALAA FARAJAHU ASH-SHAREEF;So;0;ON;;;;;N;;;;; +FDCE;ARABIC LIGATURE KARRAMA ALLAAHU WAJHAH;So;0;ON;;;;;N;;;;; FDCF;ARABIC LIGATURE SALAAMUHU ALAYNAA;So;0;ON;;;;;N;;;;; FDF0;ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM;Lo;0;AL; 0635 0644 06D2;;;;N;;;;; FDF1;ARABIC LIGATURE QALA USED AS KORANIC STOP SIGN ISOLATED FORM;Lo;0;AL; 0642 0644 06D2;;;;N;;;;; @@ -18708,6 +18770,32 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 10938;LYDIAN LETTER NN;Lo;0;R;;;;;N;;;;; 10939;LYDIAN LETTER C;Lo;0;R;;;;;N;;;;; 1093F;LYDIAN TRIANGULAR MARK;Po;0;R;;;;;N;;;;; +10940;SIDETIC LETTER N01;Lo;0;R;;;;;N;;;;; +10941;SIDETIC LETTER N02;Lo;0;R;;;;;N;;;;; +10942;SIDETIC LETTER N03;Lo;0;R;;;;;N;;;;; +10943;SIDETIC LETTER N04;Lo;0;R;;;;;N;;;;; +10944;SIDETIC LETTER 
N05;Lo;0;R;;;;;N;;;;; +10945;SIDETIC LETTER N06;Lo;0;R;;;;;N;;;;; +10946;SIDETIC LETTER N07;Lo;0;R;;;;;N;;;;; +10947;SIDETIC LETTER N08;Lo;0;R;;;;;N;;;;; +10948;SIDETIC LETTER N09;Lo;0;R;;;;;N;;;;; +10949;SIDETIC LETTER N10;Lo;0;R;;;;;N;;;;; +1094A;SIDETIC LETTER N11;Lo;0;R;;;;;N;;;;; +1094B;SIDETIC LETTER N12;Lo;0;R;;;;;N;;;;; +1094C;SIDETIC LETTER N13;Lo;0;R;;;;;N;;;;; +1094D;SIDETIC LETTER N14;Lo;0;R;;;;;N;;;;; +1094E;SIDETIC LETTER N15;Lo;0;R;;;;;N;;;;; +1094F;SIDETIC LETTER N16;Lo;0;R;;;;;N;;;;; +10950;SIDETIC LETTER N17;Lo;0;R;;;;;N;;;;; +10951;SIDETIC LETTER N18;Lo;0;R;;;;;N;;;;; +10952;SIDETIC LETTER N19;Lo;0;R;;;;;N;;;;; +10953;SIDETIC LETTER N20;Lo;0;R;;;;;N;;;;; +10954;SIDETIC LETTER N21;Lo;0;R;;;;;N;;;;; +10955;SIDETIC LETTER N22;Lo;0;R;;;;;N;;;;; +10956;SIDETIC LETTER N23;Lo;0;R;;;;;N;;;;; +10957;SIDETIC LETTER N24;Lo;0;R;;;;;N;;;;; +10958;SIDETIC LETTER N25;Lo;0;R;;;;;N;;;;; +10959;SIDETIC LETTER N26;Lo;0;R;;;;;N;;;;; 10980;MEROITIC HIEROGLYPHIC LETTER A;Lo;0;R;;;;;N;;;;; 10981;MEROITIC HIEROGLYPHIC LETTER E;Lo;0;R;;;;;N;;;;; 10982;MEROITIC HIEROGLYPHIC LETTER I;Lo;0;R;;;;;N;;;;; @@ -19541,6 +19629,20 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 10EC2;ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; 10EC3;ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; 10EC4;ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; +10EC5;ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW;Lm;0;AL;;;;;N;;;;; +10EC6;ARABIC LETTER THIN NOON;Lo;0;AL;;;;;N;;;;; +10EC7;ARABIC LETTER YEH WITH FOUR DOTS BELOW;Lo;0;AL;;;;;N;;;;; +10ED0;ARABIC BIBLICAL END OF VERSE;Po;0;ON;;;;;N;;;;; +10ED1;ARABIC LIGATURE ALAYHAA AS-SALAATU WAS-SALAAM;So;0;ON;;;;;N;;;;; +10ED2;ARABIC LIGATURE ALAYHIM AS-SALAATU WAS-SALAAM;So;0;ON;;;;;N;;;;; +10ED3;ARABIC LIGATURE ALAYHIMAA AS-SALAATU WAS-SALAAM;So;0;ON;;;;;N;;;;; +10ED4;ARABIC LIGATURE QADDASA ALLAAHU SIRRAH;So;0;ON;;;;;N;;;;; +10ED5;ARABIC LIGATURE QUDDISA SIRRUHUM;So;0;ON;;;;;N;;;;; +10ED6;ARABIC 
LIGATURE QUDDISA SIRRUHUMAA;So;0;ON;;;;;N;;;;; +10ED7;ARABIC LIGATURE QUDDISAT ASRAARUHUM;So;0;ON;;;;;N;;;;; +10ED8;ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH;So;0;ON;;;;;N;;;;; +10EFA;ARABIC DOUBLE VERTICAL BAR BELOW;Mn;220;NSM;;;;;N;;;;; +10EFB;ARABIC SMALL LOW NOON;Mn;220;NSM;;;;;N;;;;; 10EFC;ARABIC COMBINING ALEF OVERLAY;Mn;0;NSM;;;;;N;;;;; 10EFD;ARABIC SMALL LOW WORD SAKTA;Mn;220;NSM;;;;;N;;;;; 10EFE;ARABIC SMALL LOW WORD QASR;Mn;220;NSM;;;;;N;;;;; @@ -21521,6 +21623,14 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 11B07;DEVANAGARI SIGN WESTERN NINE-LIKE BHALE;Po;0;L;;;;;N;;;;; 11B08;DEVANAGARI SIGN REVERSED NINE-LIKE BHALE;Po;0;L;;;;;N;;;;; 11B09;DEVANAGARI SIGN MINDU;Po;0;L;;;;;N;;;;; +11B60;SHARADA VOWEL SIGN OE;Mn;0;NSM;;;;;N;;;;; +11B61;SHARADA VOWEL SIGN OOE;Mc;0;L;;;;;N;;;;; +11B62;SHARADA VOWEL SIGN UE;Mn;0;NSM;;;;;N;;;;; +11B63;SHARADA VOWEL SIGN UUE;Mn;0;NSM;;;;;N;;;;; +11B64;SHARADA VOWEL SIGN SHORT E;Mn;0;NSM;;;;;N;;;;; +11B65;SHARADA VOWEL SIGN SHORT O;Mc;0;L;;;;;N;;;;; +11B66;SHARADA VOWEL SIGN CANDRA E;Mn;0;NSM;;;;;N;;;;; +11B67;SHARADA VOWEL SIGN CANDRA O;Mc;0;L;;;;;N;;;;; 11BC0;SUNUWAR LETTER DEVI;Lo;0;L;;;;;N;;;;; 11BC1;SUNUWAR LETTER TASLA;Lo;0;L;;;;;N;;;;; 11BC2;SUNUWAR LETTER EKO;Lo;0;L;;;;;N;;;;; @@ -21868,6 +21978,60 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 11DA7;GUNJALA GONDI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; 11DA8;GUNJALA GONDI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 11DA9;GUNJALA GONDI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +11DB0;TOLONG SIKI LETTER I;Lo;0;L;;;;;N;;;;; +11DB1;TOLONG SIKI LETTER E;Lo;0;L;;;;;N;;;;; +11DB2;TOLONG SIKI LETTER U;Lo;0;L;;;;;N;;;;; +11DB3;TOLONG SIKI LETTER O;Lo;0;L;;;;;N;;;;; +11DB4;TOLONG SIKI LETTER A;Lo;0;L;;;;;N;;;;; +11DB5;TOLONG SIKI LETTER AA;Lo;0;L;;;;;N;;;;; +11DB6;TOLONG SIKI LETTER P;Lo;0;L;;;;;N;;;;; +11DB7;TOLONG SIKI LETTER PH;Lo;0;L;;;;;N;;;;; +11DB8;TOLONG SIKI LETTER B;Lo;0;L;;;;;N;;;;; +11DB9;TOLONG SIKI LETTER BH;Lo;0;L;;;;;N;;;;; +11DBA;TOLONG SIKI LETTER M;Lo;0;L;;;;;N;;;;; +11DBB;TOLONG 
SIKI LETTER T;Lo;0;L;;;;;N;;;;; +11DBC;TOLONG SIKI LETTER TH;Lo;0;L;;;;;N;;;;; +11DBD;TOLONG SIKI LETTER D;Lo;0;L;;;;;N;;;;; +11DBE;TOLONG SIKI LETTER DH;Lo;0;L;;;;;N;;;;; +11DBF;TOLONG SIKI LETTER N;Lo;0;L;;;;;N;;;;; +11DC0;TOLONG SIKI LETTER TT;Lo;0;L;;;;;N;;;;; +11DC1;TOLONG SIKI LETTER TTH;Lo;0;L;;;;;N;;;;; +11DC2;TOLONG SIKI LETTER DD;Lo;0;L;;;;;N;;;;; +11DC3;TOLONG SIKI LETTER DDH;Lo;0;L;;;;;N;;;;; +11DC4;TOLONG SIKI LETTER NN;Lo;0;L;;;;;N;;;;; +11DC5;TOLONG SIKI LETTER C;Lo;0;L;;;;;N;;;;; +11DC6;TOLONG SIKI LETTER CH;Lo;0;L;;;;;N;;;;; +11DC7;TOLONG SIKI LETTER J;Lo;0;L;;;;;N;;;;; +11DC8;TOLONG SIKI LETTER JH;Lo;0;L;;;;;N;;;;; +11DC9;TOLONG SIKI LETTER NY;Lo;0;L;;;;;N;;;;; +11DCA;TOLONG SIKI LETTER K;Lo;0;L;;;;;N;;;;; +11DCB;TOLONG SIKI LETTER KH;Lo;0;L;;;;;N;;;;; +11DCC;TOLONG SIKI LETTER G;Lo;0;L;;;;;N;;;;; +11DCD;TOLONG SIKI LETTER GH;Lo;0;L;;;;;N;;;;; +11DCE;TOLONG SIKI LETTER NG;Lo;0;L;;;;;N;;;;; +11DCF;TOLONG SIKI LETTER Y;Lo;0;L;;;;;N;;;;; +11DD0;TOLONG SIKI LETTER R;Lo;0;L;;;;;N;;;;; +11DD1;TOLONG SIKI LETTER L;Lo;0;L;;;;;N;;;;; +11DD2;TOLONG SIKI LETTER V;Lo;0;L;;;;;N;;;;; +11DD3;TOLONG SIKI LETTER NNY;Lo;0;L;;;;;N;;;;; +11DD4;TOLONG SIKI LETTER S;Lo;0;L;;;;;N;;;;; +11DD5;TOLONG SIKI LETTER H;Lo;0;L;;;;;N;;;;; +11DD6;TOLONG SIKI LETTER X;Lo;0;L;;;;;N;;;;; +11DD7;TOLONG SIKI LETTER RR;Lo;0;L;;;;;N;;;;; +11DD8;TOLONG SIKI LETTER RRH;Lo;0;L;;;;;N;;;;; +11DD9;TOLONG SIKI SIGN SELA;Lm;0;L;;;;;N;;;;; +11DDA;TOLONG SIKI SIGN HECAKA;Lo;0;L;;;;;N;;;;; +11DDB;TOLONG SIKI UNGGA;Lo;0;L;;;;;N;;;;; +11DE0;TOLONG SIKI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +11DE1;TOLONG SIKI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +11DE2;TOLONG SIKI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +11DE3;TOLONG SIKI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +11DE4;TOLONG SIKI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +11DE5;TOLONG SIKI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +11DE6;TOLONG SIKI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +11DE7;TOLONG SIKI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +11DE8;TOLONG SIKI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +11DE9;TOLONG 
SIKI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 11EE0;MAKASAR LETTER KA;Lo;0;L;;;;;N;;;;; 11EE1;MAKASAR LETTER GA;Lo;0;L;;;;;N;;;;; 11EE2;MAKASAR LETTER NGA;Lo;0;L;;;;;N;;;;; @@ -22088,8 +22252,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12035;CUNEIFORM SIGN ARAD TIMES KUR;Lo;0;L;;;;;N;;;;; 12036;CUNEIFORM SIGN ARKAB;Lo;0;L;;;;;N;;;;; 12037;CUNEIFORM SIGN ASAL2;Lo;0;L;;;;;N;;;;; -12038;CUNEIFORM SIGN ASH;Lo;0;L;;;;;N;;;;; -12039;CUNEIFORM SIGN ASH ZIDA TENU;Lo;0;L;;;;;N;;;;; +12038;CUNEIFORM SIGN ASH;Lo;0;L;;;;1;N;;;;; +12039;CUNEIFORM SIGN ASH ZIDA TENU;Lo;0;L;;;;1;N;;;;; 1203A;CUNEIFORM SIGN ASH KABA TENU;Lo;0;L;;;;;N;;;;; 1203B;CUNEIFORM SIGN ASH OVER ASH TUG2 OVER TUG2 TUG2 OVER TUG2 PAP;Lo;0;L;;;;;N;;;;; 1203C;CUNEIFORM SIGN ASH OVER ASH OVER ASH;Lo;0;L;;;;;N;;;;; @@ -22153,7 +22317,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12076;CUNEIFORM SIGN DIM2;Lo;0;L;;;;;N;;;;; 12077;CUNEIFORM SIGN DIN;Lo;0;L;;;;;N;;;;; 12078;CUNEIFORM SIGN DIN KASKAL U GUNU DISH;Lo;0;L;;;;;N;;;;; -12079;CUNEIFORM SIGN DISH;Lo;0;L;;;;;N;;;;; +12079;CUNEIFORM SIGN DISH;Lo;0;L;;;;1;N;;;;; 1207A;CUNEIFORM SIGN DU;Lo;0;L;;;;;N;;;;; 1207B;CUNEIFORM SIGN DU OVER DU;Lo;0;L;;;;;N;;;;; 1207C;CUNEIFORM SIGN DU GUNU;Lo;0;L;;;;;N;;;;; @@ -22582,12 +22746,12 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12223;CUNEIFORM SIGN MA2;Lo;0;L;;;;;N;;;;; 12224;CUNEIFORM SIGN MAH;Lo;0;L;;;;;N;;;;; 12225;CUNEIFORM SIGN MAR;Lo;0;L;;;;;N;;;;; -12226;CUNEIFORM SIGN MASH;Lo;0;L;;;;;N;;;;; +12226;CUNEIFORM SIGN MASH;Lo;0;L;;;;1/2;N;;;;; 12227;CUNEIFORM SIGN MASH2;Lo;0;L;;;;;N;;;;; 12228;CUNEIFORM SIGN ME;Lo;0;L;;;;;N;;;;; 12229;CUNEIFORM SIGN MES;Lo;0;L;;;;;N;;;;; 1222A;CUNEIFORM SIGN MI;Lo;0;L;;;;;N;;;;; -1222B;CUNEIFORM SIGN MIN;Lo;0;L;;;;;N;;;;; +1222B;CUNEIFORM SIGN MIN;Lo;0;L;;;;2;N;;;;; 1222C;CUNEIFORM SIGN MU;Lo;0;L;;;;;N;;;;; 1222D;CUNEIFORM SIGN MU OVER MU;Lo;0;L;;;;;N;;;;; 1222E;CUNEIFORM SIGN MUG;Lo;0;L;;;;;N;;;;; @@ -22811,9 +22975,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 
12308;CUNEIFORM SIGN TUM;Lo;0;L;;;;;N;;;;; 12309;CUNEIFORM SIGN TUR;Lo;0;L;;;;;N;;;;; 1230A;CUNEIFORM SIGN TUR OVER TUR ZA OVER ZA;Lo;0;L;;;;;N;;;;; -1230B;CUNEIFORM SIGN U;Lo;0;L;;;;;N;;;;; +1230B;CUNEIFORM SIGN U;Lo;0;L;;;;1;N;;;;; 1230C;CUNEIFORM SIGN U GUD;Lo;0;L;;;;;N;;;;; -1230D;CUNEIFORM SIGN U U U;Lo;0;L;;;;;N;;;;; +1230D;CUNEIFORM SIGN U U U;Lo;0;L;;;;3;N;;;;; 1230E;CUNEIFORM SIGN U OVER U PA OVER PA GAR OVER GAR;Lo;0;L;;;;;N;;;;; 1230F;CUNEIFORM SIGN U OVER U SUR OVER SUR;Lo;0;L;;;;;N;;;;; 12310;CUNEIFORM SIGN U OVER U U REVERSED OVER U REVERSED;Lo;0;L;;;;;N;;;;; @@ -22953,7 +23117,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12396;CUNEIFORM SIGN SAG TIMES IGI GUNU;Lo;0;L;;;;;N;;;;; 12397;CUNEIFORM SIGN TI2;Lo;0;L;;;;;N;;;;; 12398;CUNEIFORM SIGN UM TIMES ME;Lo;0;L;;;;;N;;;;; -12399;CUNEIFORM SIGN U U;Lo;0;L;;;;;N;;;;; +12399;CUNEIFORM SIGN U U;Lo;0;L;;;;2;N;;;;; 12400;CUNEIFORM NUMERIC SIGN TWO ASH;Nl;0;L;;;;2;N;;;;; 12401;CUNEIFORM NUMERIC SIGN THREE ASH;Nl;0;L;;;;3;N;;;;; 12402;CUNEIFORM NUMERIC SIGN FOUR ASH;Nl;0;L;;;;4;N;;;;; @@ -30124,6 +30288,56 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 16E98;MEDEFAIDRIN FULL STOP;Po;0;L;;;;;N;;;;; 16E99;MEDEFAIDRIN SYMBOL AIVA;Po;0;L;;;;;N;;;;; 16E9A;MEDEFAIDRIN EXCLAMATION OH;Po;0;L;;;;;N;;;;; +16EA0;BERIA ERFE CAPITAL LETTER ARKAB;Lu;0;L;;;;;N;;;;16EBB; +16EA1;BERIA ERFE CAPITAL LETTER BASIGNA;Lu;0;L;;;;;N;;;;16EBC; +16EA2;BERIA ERFE CAPITAL LETTER DARBAI;Lu;0;L;;;;;N;;;;16EBD; +16EA3;BERIA ERFE CAPITAL LETTER EH;Lu;0;L;;;;;N;;;;16EBE; +16EA4;BERIA ERFE CAPITAL LETTER FITKO;Lu;0;L;;;;;N;;;;16EBF; +16EA5;BERIA ERFE CAPITAL LETTER GOWAY;Lu;0;L;;;;;N;;;;16EC0; +16EA6;BERIA ERFE CAPITAL LETTER HIRDEABO;Lu;0;L;;;;;N;;;;16EC1; +16EA7;BERIA ERFE CAPITAL LETTER I;Lu;0;L;;;;;N;;;;16EC2; +16EA8;BERIA ERFE CAPITAL LETTER DJAI;Lu;0;L;;;;;N;;;;16EC3; +16EA9;BERIA ERFE CAPITAL LETTER KOBO;Lu;0;L;;;;;N;;;;16EC4; +16EAA;BERIA ERFE CAPITAL LETTER LAKKO;Lu;0;L;;;;;N;;;;16EC5; +16EAB;BERIA ERFE CAPITAL LETTER 
MERI;Lu;0;L;;;;;N;;;;16EC6; +16EAC;BERIA ERFE CAPITAL LETTER NINI;Lu;0;L;;;;;N;;;;16EC7; +16EAD;BERIA ERFE CAPITAL LETTER GNA;Lu;0;L;;;;;N;;;;16EC8; +16EAE;BERIA ERFE CAPITAL LETTER NGAY;Lu;0;L;;;;;N;;;;16EC9; +16EAF;BERIA ERFE CAPITAL LETTER OI;Lu;0;L;;;;;N;;;;16ECA; +16EB0;BERIA ERFE CAPITAL LETTER PI;Lu;0;L;;;;;N;;;;16ECB; +16EB1;BERIA ERFE CAPITAL LETTER ERIGO;Lu;0;L;;;;;N;;;;16ECC; +16EB2;BERIA ERFE CAPITAL LETTER ERIGO TAMURA;Lu;0;L;;;;;N;;;;16ECD; +16EB3;BERIA ERFE CAPITAL LETTER SERI;Lu;0;L;;;;;N;;;;16ECE; +16EB4;BERIA ERFE CAPITAL LETTER SHEP;Lu;0;L;;;;;N;;;;16ECF; +16EB5;BERIA ERFE CAPITAL LETTER TATASOUE;Lu;0;L;;;;;N;;;;16ED0; +16EB6;BERIA ERFE CAPITAL LETTER UI;Lu;0;L;;;;;N;;;;16ED1; +16EB7;BERIA ERFE CAPITAL LETTER WASSE;Lu;0;L;;;;;N;;;;16ED2; +16EB8;BERIA ERFE CAPITAL LETTER AY;Lu;0;L;;;;;N;;;;16ED3; +16EBB;BERIA ERFE SMALL LETTER ARKAB;Ll;0;L;;;;;N;;;16EA0;;16EA0 +16EBC;BERIA ERFE SMALL LETTER BASIGNA;Ll;0;L;;;;;N;;;16EA1;;16EA1 +16EBD;BERIA ERFE SMALL LETTER DARBAI;Ll;0;L;;;;;N;;;16EA2;;16EA2 +16EBE;BERIA ERFE SMALL LETTER EH;Ll;0;L;;;;;N;;;16EA3;;16EA3 +16EBF;BERIA ERFE SMALL LETTER FITKO;Ll;0;L;;;;;N;;;16EA4;;16EA4 +16EC0;BERIA ERFE SMALL LETTER GOWAY;Ll;0;L;;;;;N;;;16EA5;;16EA5 +16EC1;BERIA ERFE SMALL LETTER HIRDEABO;Ll;0;L;;;;;N;;;16EA6;;16EA6 +16EC2;BERIA ERFE SMALL LETTER I;Ll;0;L;;;;;N;;;16EA7;;16EA7 +16EC3;BERIA ERFE SMALL LETTER DJAI;Ll;0;L;;;;;N;;;16EA8;;16EA8 +16EC4;BERIA ERFE SMALL LETTER KOBO;Ll;0;L;;;;;N;;;16EA9;;16EA9 +16EC5;BERIA ERFE SMALL LETTER LAKKO;Ll;0;L;;;;;N;;;16EAA;;16EAA +16EC6;BERIA ERFE SMALL LETTER MERI;Ll;0;L;;;;;N;;;16EAB;;16EAB +16EC7;BERIA ERFE SMALL LETTER NINI;Ll;0;L;;;;;N;;;16EAC;;16EAC +16EC8;BERIA ERFE SMALL LETTER GNA;Ll;0;L;;;;;N;;;16EAD;;16EAD +16EC9;BERIA ERFE SMALL LETTER NGAY;Ll;0;L;;;;;N;;;16EAE;;16EAE +16ECA;BERIA ERFE SMALL LETTER OI;Ll;0;L;;;;;N;;;16EAF;;16EAF +16ECB;BERIA ERFE SMALL LETTER PI;Ll;0;L;;;;;N;;;16EB0;;16EB0 +16ECC;BERIA ERFE SMALL LETTER ERIGO;Ll;0;L;;;;;N;;;16EB1;;16EB1 +16ECD;BERIA ERFE 
SMALL LETTER ERIGO TAMURA;Ll;0;L;;;;;N;;;16EB2;;16EB2 +16ECE;BERIA ERFE SMALL LETTER SERI;Ll;0;L;;;;;N;;;16EB3;;16EB3 +16ECF;BERIA ERFE SMALL LETTER SHEP;Ll;0;L;;;;;N;;;16EB4;;16EB4 +16ED0;BERIA ERFE SMALL LETTER TATASOUE;Ll;0;L;;;;;N;;;16EB5;;16EB5 +16ED1;BERIA ERFE SMALL LETTER UI;Ll;0;L;;;;;N;;;16EB6;;16EB6 +16ED2;BERIA ERFE SMALL LETTER WASSE;Ll;0;L;;;;;N;;;16EB7;;16EB7 +16ED3;BERIA ERFE SMALL LETTER AY;Ll;0;L;;;;;N;;;16EB8;;16EB8 16F00;MIAO LETTER PA;Lo;0;L;;;;;N;;;;; 16F01;MIAO LETTER BA;Lo;0;L;;;;;N;;;;; 16F02;MIAO LETTER YI PA;Lo;0;L;;;;;N;;;;; @@ -30280,8 +30494,13 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 16FE4;KHITAN SMALL SCRIPT FILLER;Mn;0;NSM;;;;;N;;;;; 16FF0;VIETNAMESE ALTERNATE READING MARK CA;Mc;6;L;;;;;N;;;;; 16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;; +16FF2;CHINESE SMALL SIMPLIFIED ER;Lm;0;L;;;;;N;;;;; +16FF3;CHINESE SMALL TRADITIONAL ER;Lm;0;L;;;;;N;;;;; +16FF4;YANGQIN SIGN SLOW ONE BEAT;Nl;0;L;;;;1;N;;;;; +16FF5;YANGQIN SIGN SLOW THREE HALF BEATS;Nl;0;L;;;;3/2;N;;;;; +16FF6;YANGQIN SIGN SLOW TWO BEATS;Nl;0;L;;;;2;N;;;;; 17000;;Lo;0;L;;;;;N;;;;; -187F7;;Lo;0;L;;;;;N;;;;; +187FF;;Lo;0;L;;;;;N;;;;; 18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;; 18801;TANGUT COMPONENT-002;Lo;0;L;;;;;N;;;;; 18802;TANGUT COMPONENT-003;Lo;0;L;;;;;N;;;;; @@ -31522,7 +31741,122 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 18CD5;KHITAN SMALL SCRIPT CHARACTER-18CD5;Lo;0;L;;;;;N;;;;; 18CFF;KHITAN SMALL SCRIPT CHARACTER-18CFF;Lo;0;L;;;;;N;;;;; 18D00;;Lo;0;L;;;;;N;;;;; -18D08;;Lo;0;L;;;;;N;;;;; +18D1E;;Lo;0;L;;;;;N;;;;; +18D80;TANGUT COMPONENT-769;Lo;0;L;;;;;N;;;;; +18D81;TANGUT COMPONENT-770;Lo;0;L;;;;;N;;;;; +18D82;TANGUT COMPONENT-771;Lo;0;L;;;;;N;;;;; +18D83;TANGUT COMPONENT-772;Lo;0;L;;;;;N;;;;; +18D84;TANGUT COMPONENT-773;Lo;0;L;;;;;N;;;;; +18D85;TANGUT COMPONENT-774;Lo;0;L;;;;;N;;;;; +18D86;TANGUT COMPONENT-775;Lo;0;L;;;;;N;;;;; +18D87;TANGUT COMPONENT-776;Lo;0;L;;;;;N;;;;; +18D88;TANGUT COMPONENT-777;Lo;0;L;;;;;N;;;;; +18D89;TANGUT 
COMPONENT-778;Lo;0;L;;;;;N;;;;; +18D8A;TANGUT COMPONENT-779;Lo;0;L;;;;;N;;;;; +18D8B;TANGUT COMPONENT-780;Lo;0;L;;;;;N;;;;; +18D8C;TANGUT COMPONENT-781;Lo;0;L;;;;;N;;;;; +18D8D;TANGUT COMPONENT-782;Lo;0;L;;;;;N;;;;; +18D8E;TANGUT COMPONENT-783;Lo;0;L;;;;;N;;;;; +18D8F;TANGUT COMPONENT-784;Lo;0;L;;;;;N;;;;; +18D90;TANGUT COMPONENT-785;Lo;0;L;;;;;N;;;;; +18D91;TANGUT COMPONENT-786;Lo;0;L;;;;;N;;;;; +18D92;TANGUT COMPONENT-787;Lo;0;L;;;;;N;;;;; +18D93;TANGUT COMPONENT-788;Lo;0;L;;;;;N;;;;; +18D94;TANGUT COMPONENT-789;Lo;0;L;;;;;N;;;;; +18D95;TANGUT COMPONENT-790;Lo;0;L;;;;;N;;;;; +18D96;TANGUT COMPONENT-791;Lo;0;L;;;;;N;;;;; +18D97;TANGUT COMPONENT-792;Lo;0;L;;;;;N;;;;; +18D98;TANGUT COMPONENT-793;Lo;0;L;;;;;N;;;;; +18D99;TANGUT COMPONENT-794;Lo;0;L;;;;;N;;;;; +18D9A;TANGUT COMPONENT-795;Lo;0;L;;;;;N;;;;; +18D9B;TANGUT COMPONENT-796;Lo;0;L;;;;;N;;;;; +18D9C;TANGUT COMPONENT-797;Lo;0;L;;;;;N;;;;; +18D9D;TANGUT COMPONENT-798;Lo;0;L;;;;;N;;;;; +18D9E;TANGUT COMPONENT-799;Lo;0;L;;;;;N;;;;; +18D9F;TANGUT COMPONENT-800;Lo;0;L;;;;;N;;;;; +18DA0;TANGUT COMPONENT-801;Lo;0;L;;;;;N;;;;; +18DA1;TANGUT COMPONENT-802;Lo;0;L;;;;;N;;;;; +18DA2;TANGUT COMPONENT-803;Lo;0;L;;;;;N;;;;; +18DA3;TANGUT COMPONENT-804;Lo;0;L;;;;;N;;;;; +18DA4;TANGUT COMPONENT-805;Lo;0;L;;;;;N;;;;; +18DA5;TANGUT COMPONENT-806;Lo;0;L;;;;;N;;;;; +18DA6;TANGUT COMPONENT-807;Lo;0;L;;;;;N;;;;; +18DA7;TANGUT COMPONENT-808;Lo;0;L;;;;;N;;;;; +18DA8;TANGUT COMPONENT-809;Lo;0;L;;;;;N;;;;; +18DA9;TANGUT COMPONENT-810;Lo;0;L;;;;;N;;;;; +18DAA;TANGUT COMPONENT-811;Lo;0;L;;;;;N;;;;; +18DAB;TANGUT COMPONENT-812;Lo;0;L;;;;;N;;;;; +18DAC;TANGUT COMPONENT-813;Lo;0;L;;;;;N;;;;; +18DAD;TANGUT COMPONENT-814;Lo;0;L;;;;;N;;;;; +18DAE;TANGUT COMPONENT-815;Lo;0;L;;;;;N;;;;; +18DAF;TANGUT COMPONENT-816;Lo;0;L;;;;;N;;;;; +18DB0;TANGUT COMPONENT-817;Lo;0;L;;;;;N;;;;; +18DB1;TANGUT COMPONENT-818;Lo;0;L;;;;;N;;;;; +18DB2;TANGUT COMPONENT-819;Lo;0;L;;;;;N;;;;; +18DB3;TANGUT COMPONENT-820;Lo;0;L;;;;;N;;;;; +18DB4;TANGUT 
COMPONENT-821;Lo;0;L;;;;;N;;;;; +18DB5;TANGUT COMPONENT-822;Lo;0;L;;;;;N;;;;; +18DB6;TANGUT COMPONENT-823;Lo;0;L;;;;;N;;;;; +18DB7;TANGUT COMPONENT-824;Lo;0;L;;;;;N;;;;; +18DB8;TANGUT COMPONENT-825;Lo;0;L;;;;;N;;;;; +18DB9;TANGUT COMPONENT-826;Lo;0;L;;;;;N;;;;; +18DBA;TANGUT COMPONENT-827;Lo;0;L;;;;;N;;;;; +18DBB;TANGUT COMPONENT-828;Lo;0;L;;;;;N;;;;; +18DBC;TANGUT COMPONENT-829;Lo;0;L;;;;;N;;;;; +18DBD;TANGUT COMPONENT-830;Lo;0;L;;;;;N;;;;; +18DBE;TANGUT COMPONENT-831;Lo;0;L;;;;;N;;;;; +18DBF;TANGUT COMPONENT-832;Lo;0;L;;;;;N;;;;; +18DC0;TANGUT COMPONENT-833;Lo;0;L;;;;;N;;;;; +18DC1;TANGUT COMPONENT-834;Lo;0;L;;;;;N;;;;; +18DC2;TANGUT COMPONENT-835;Lo;0;L;;;;;N;;;;; +18DC3;TANGUT COMPONENT-836;Lo;0;L;;;;;N;;;;; +18DC4;TANGUT COMPONENT-837;Lo;0;L;;;;;N;;;;; +18DC5;TANGUT COMPONENT-838;Lo;0;L;;;;;N;;;;; +18DC6;TANGUT COMPONENT-839;Lo;0;L;;;;;N;;;;; +18DC7;TANGUT COMPONENT-840;Lo;0;L;;;;;N;;;;; +18DC8;TANGUT COMPONENT-841;Lo;0;L;;;;;N;;;;; +18DC9;TANGUT COMPONENT-842;Lo;0;L;;;;;N;;;;; +18DCA;TANGUT COMPONENT-843;Lo;0;L;;;;;N;;;;; +18DCB;TANGUT COMPONENT-844;Lo;0;L;;;;;N;;;;; +18DCC;TANGUT COMPONENT-845;Lo;0;L;;;;;N;;;;; +18DCD;TANGUT COMPONENT-846;Lo;0;L;;;;;N;;;;; +18DCE;TANGUT COMPONENT-847;Lo;0;L;;;;;N;;;;; +18DCF;TANGUT COMPONENT-848;Lo;0;L;;;;;N;;;;; +18DD0;TANGUT COMPONENT-849;Lo;0;L;;;;;N;;;;; +18DD1;TANGUT COMPONENT-850;Lo;0;L;;;;;N;;;;; +18DD2;TANGUT COMPONENT-851;Lo;0;L;;;;;N;;;;; +18DD3;TANGUT COMPONENT-852;Lo;0;L;;;;;N;;;;; +18DD4;TANGUT COMPONENT-853;Lo;0;L;;;;;N;;;;; +18DD5;TANGUT COMPONENT-854;Lo;0;L;;;;;N;;;;; +18DD6;TANGUT COMPONENT-855;Lo;0;L;;;;;N;;;;; +18DD7;TANGUT COMPONENT-856;Lo;0;L;;;;;N;;;;; +18DD8;TANGUT COMPONENT-857;Lo;0;L;;;;;N;;;;; +18DD9;TANGUT COMPONENT-858;Lo;0;L;;;;;N;;;;; +18DDA;TANGUT COMPONENT-859;Lo;0;L;;;;;N;;;;; +18DDB;TANGUT COMPONENT-860;Lo;0;L;;;;;N;;;;; +18DDC;TANGUT COMPONENT-861;Lo;0;L;;;;;N;;;;; +18DDD;TANGUT COMPONENT-862;Lo;0;L;;;;;N;;;;; +18DDE;TANGUT COMPONENT-863;Lo;0;L;;;;;N;;;;; +18DDF;TANGUT 
COMPONENT-864;Lo;0;L;;;;;N;;;;; +18DE0;TANGUT COMPONENT-865;Lo;0;L;;;;;N;;;;; +18DE1;TANGUT COMPONENT-866;Lo;0;L;;;;;N;;;;; +18DE2;TANGUT COMPONENT-867;Lo;0;L;;;;;N;;;;; +18DE3;TANGUT COMPONENT-868;Lo;0;L;;;;;N;;;;; +18DE4;TANGUT COMPONENT-869;Lo;0;L;;;;;N;;;;; +18DE5;TANGUT COMPONENT-870;Lo;0;L;;;;;N;;;;; +18DE6;TANGUT COMPONENT-871;Lo;0;L;;;;;N;;;;; +18DE7;TANGUT COMPONENT-872;Lo;0;L;;;;;N;;;;; +18DE8;TANGUT COMPONENT-873;Lo;0;L;;;;;N;;;;; +18DE9;TANGUT COMPONENT-874;Lo;0;L;;;;;N;;;;; +18DEA;TANGUT COMPONENT-875;Lo;0;L;;;;;N;;;;; +18DEB;TANGUT COMPONENT-876;Lo;0;L;;;;;N;;;;; +18DEC;TANGUT COMPONENT-877;Lo;0;L;;;;;N;;;;; +18DED;TANGUT COMPONENT-878;Lo;0;L;;;;;N;;;;; +18DEE;TANGUT COMPONENT-879;Lo;0;L;;;;;N;;;;; +18DEF;TANGUT COMPONENT-880;Lo;0;L;;;;;N;;;;; +18DF0;TANGUT COMPONENT-881;Lo;0;L;;;;;N;;;;; +18DF1;TANGUT COMPONENT-882;Lo;0;L;;;;;N;;;;; +18DF2;TANGUT COMPONENT-883;Lo;0;L;;;;;N;;;;; 1AFF0;KATAKANA LETTER MINNAN TONE-2;Lm;0;L;;;;;N;;;;; 1AFF1;KATAKANA LETTER MINNAN TONE-3;Lm;0;L;;;;;N;;;;; 1AFF2;KATAKANA LETTER MINNAN TONE-4;Lm;0;L;;;;;N;;;;; @@ -32629,6 +32963,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1CCF7;OUTLINED DIGIT SEVEN;Nd;0;EN; 0037;7;7;7;N;;;;; 1CCF8;OUTLINED DIGIT EIGHT;Nd;0;EN; 0038;8;8;8;N;;;;; 1CCF9;OUTLINED DIGIT NINE;Nd;0;EN; 0039;9;9;9;N;;;;; +1CCFA;SNAKE SYMBOL;So;0;ON;;;;;N;;;;; +1CCFB;FLYING SAUCER SYMBOL;So;0;ON;;;;;N;;;;; +1CCFC;NOSE SYMBOL;So;0;ON;;;;;N;;;;; 1CD00;BLOCK OCTANT-3;So;0;ON;;;;;N;;;;; 1CD01;BLOCK OCTANT-23;So;0;ON;;;;;N;;;;; 1CD02;BLOCK OCTANT-123;So;0;ON;;;;;N;;;;; @@ -33065,6 +33402,46 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1CEB1;KEYHOLE;So;0;ON;;;;;N;;;;; 1CEB2;OLD PERSONAL COMPUTER WITH MONITOR IN PORTRAIT ORIENTATION;So;0;ON;;;;;N;;;;; 1CEB3;BLACK RIGHT TRIANGLE CARET;So;0;ON;;;;;N;;;;; +1CEBA;FRAGILE SYMBOL;So;0;ON;;;;;N;;;;; +1CEBB;OFFICE BUILDING SYMBOL;So;0;ON;;;;;N;;;;; +1CEBC;TREE SYMBOL;So;0;ON;;;;;N;;;;; +1CEBD;APPLE SYMBOL;So;0;ON;;;;;N;;;;; +1CEBE;CHERRY SYMBOL;So;0;ON;;;;;N;;;;; 
+1CEBF;STRAWBERRY SYMBOL;So;0;ON;;;;;N;;;;; +1CEC0;HEBE;So;0;ON;;;;;N;;;;; +1CEC1;IRIS;So;0;ON;;;;;N;;;;; +1CEC2;FLORA;So;0;ON;;;;;N;;;;; +1CEC3;METIS;So;0;ON;;;;;N;;;;; +1CEC4;PARTHENOPE;So;0;ON;;;;;N;;;;; +1CEC5;VICTORIA;So;0;ON;;;;;N;;;;; +1CEC6;EGERIA;So;0;ON;;;;;N;;;;; +1CEC7;IRENE;So;0;ON;;;;;N;;;;; +1CEC8;EUNOMIA;So;0;ON;;;;;N;;;;; +1CEC9;PSYCHE;So;0;ON;;;;;N;;;;; +1CECA;THETIS;So;0;ON;;;;;N;;;;; +1CECB;MELPOMENE;So;0;ON;;;;;N;;;;; +1CECC;FORTUNA;So;0;ON;;;;;N;;;;; +1CECD;ASTRONOMICAL SYMBOL FOR ASTEROID PROSERPINA;So;0;ON;;;;;N;;;;; +1CECE;BELLONA;So;0;ON;;;;;N;;;;; +1CECF;AMPHITRITE;So;0;ON;;;;;N;;;;; +1CED0;LEUKOTHEA;So;0;ON;;;;;N;;;;; +1CEE0;GEOMANTIC FIGURE POPULUS;So;0;ON;;;;;N;;;;; +1CEE1;GEOMANTIC FIGURE TRISTITIA;So;0;ON;;;;;N;;;;; +1CEE2;GEOMANTIC FIGURE ALBUS;So;0;ON;;;;;N;;;;; +1CEE3;GEOMANTIC FIGURE FORTUNA MAJOR;So;0;ON;;;;;N;;;;; +1CEE4;GEOMANTIC FIGURE RUBEUS;So;0;ON;;;;;N;;;;; +1CEE5;GEOMANTIC FIGURE ACQUISITIO;So;0;ON;;;;;N;;;;; +1CEE6;GEOMANTIC FIGURE CONJUNCTIO;So;0;ON;;;;;N;;;;; +1CEE7;GEOMANTIC FIGURE CAPUT DRACONIS;So;0;ON;;;;;N;;;;; +1CEE8;GEOMANTIC FIGURE LAETITIA;So;0;ON;;;;;N;;;;; +1CEE9;GEOMANTIC FIGURE CARCER;So;0;ON;;;;;N;;;;; +1CEEA;GEOMANTIC FIGURE AMISSIO;So;0;ON;;;;;N;;;;; +1CEEB;GEOMANTIC FIGURE PUELLA;So;0;ON;;;;;N;;;;; +1CEEC;GEOMANTIC FIGURE FORTUNA MINOR;So;0;ON;;;;;N;;;;; +1CEED;GEOMANTIC FIGURE PUER;So;0;ON;;;;;N;;;;; +1CEEE;GEOMANTIC FIGURE CAUDA DRACONIS;So;0;ON;;;;;N;;;;; +1CEEF;GEOMANTIC FIGURE VIA;So;0;ON;;;;;N;;;;; +1CEF0;MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR;Sm;0;ON;;;;;N;;;;; 1CF00;ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT;Mn;0;NSM;;;;;N;;;;; 1CF01;ZNAMENNY COMBINING MARK NIZKO S KRYZHEM ON LEFT;Mn;0;NSM;;;;;N;;;;; 1CF02;ZNAMENNY COMBINING MARK TSATA ON LEFT;Mn;0;NSM;;;;;N;;;;; @@ -36004,6 +36381,61 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1E5F9;OL ONAL DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 1E5FA;OL ONAL DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 1E5FF;OL ONAL ABBREVIATION SIGN;Po;0;L;;;;;N;;;;; 
+1E6C0;TAI YO LETTER LOW KO;Lo;0;L;;;;;N;;;;; +1E6C1;TAI YO LETTER HIGH KO;Lo;0;L;;;;;N;;;;; +1E6C2;TAI YO LETTER LOW KHO;Lo;0;L;;;;;N;;;;; +1E6C3;TAI YO LETTER HIGH KHO;Lo;0;L;;;;;N;;;;; +1E6C4;TAI YO LETTER GO;Lo;0;L;;;;;N;;;;; +1E6C5;TAI YO LETTER NGO;Lo;0;L;;;;;N;;;;; +1E6C6;TAI YO LETTER CO;Lo;0;L;;;;;N;;;;; +1E6C7;TAI YO LETTER LOW XO;Lo;0;L;;;;;N;;;;; +1E6C8;TAI YO LETTER HIGH XO;Lo;0;L;;;;;N;;;;; +1E6C9;TAI YO LETTER LOW NYO;Lo;0;L;;;;;N;;;;; +1E6CA;TAI YO LETTER HIGH NYO;Lo;0;L;;;;;N;;;;; +1E6CB;TAI YO LETTER DO;Lo;0;L;;;;;N;;;;; +1E6CC;TAI YO LETTER LOW TO;Lo;0;L;;;;;N;;;;; +1E6CD;TAI YO LETTER HIGH TO;Lo;0;L;;;;;N;;;;; +1E6CE;TAI YO LETTER THO;Lo;0;L;;;;;N;;;;; +1E6CF;TAI YO LETTER NO;Lo;0;L;;;;;N;;;;; +1E6D0;TAI YO LETTER BO;Lo;0;L;;;;;N;;;;; +1E6D1;TAI YO LETTER LOW PO;Lo;0;L;;;;;N;;;;; +1E6D2;TAI YO LETTER HIGH PO;Lo;0;L;;;;;N;;;;; +1E6D3;TAI YO LETTER PHO;Lo;0;L;;;;;N;;;;; +1E6D4;TAI YO LETTER LOW FO;Lo;0;L;;;;;N;;;;; +1E6D5;TAI YO LETTER HIGH FO;Lo;0;L;;;;;N;;;;; +1E6D6;TAI YO LETTER MO;Lo;0;L;;;;;N;;;;; +1E6D7;TAI YO LETTER YO;Lo;0;L;;;;;N;;;;; +1E6D8;TAI YO LETTER LO;Lo;0;L;;;;;N;;;;; +1E6D9;TAI YO LETTER VO;Lo;0;L;;;;;N;;;;; +1E6DA;TAI YO LETTER LOW HO;Lo;0;L;;;;;N;;;;; +1E6DB;TAI YO LETTER HIGH HO;Lo;0;L;;;;;N;;;;; +1E6DC;TAI YO LETTER QO;Lo;0;L;;;;;N;;;;; +1E6DD;TAI YO LETTER LOW KVO;Lo;0;L;;;;;N;;;;; +1E6DE;TAI YO LETTER HIGH KVO;Lo;0;L;;;;;N;;;;; +1E6E0;TAI YO LETTER AA;Lo;0;L;;;;;N;;;;; +1E6E1;TAI YO LETTER I;Lo;0;L;;;;;N;;;;; +1E6E2;TAI YO LETTER UE;Lo;0;L;;;;;N;;;;; +1E6E3;TAI YO SIGN UE;Mn;230;NSM;;;;;N;;;;; +1E6E4;TAI YO LETTER U;Lo;0;L;;;;;N;;;;; +1E6E5;TAI YO LETTER AE;Lo;0;L;;;;;N;;;;; +1E6E6;TAI YO SIGN AU;Mn;230;NSM;;;;;N;;;;; +1E6E7;TAI YO LETTER O;Lo;0;L;;;;;N;;;;; +1E6E8;TAI YO LETTER E;Lo;0;L;;;;;N;;;;; +1E6E9;TAI YO LETTER IA;Lo;0;L;;;;;N;;;;; +1E6EA;TAI YO LETTER UEA;Lo;0;L;;;;;N;;;;; +1E6EB;TAI YO LETTER UA;Lo;0;L;;;;;N;;;;; +1E6EC;TAI YO LETTER OO;Lo;0;L;;;;;N;;;;; +1E6ED;TAI YO LETTER AUE;Lo;0;L;;;;;N;;;;; +1E6EE;TAI YO 
SIGN AY;Mn;230;NSM;;;;;N;;;;; +1E6EF;TAI YO SIGN ANG;Mn;230;NSM;;;;;N;;;;; +1E6F0;TAI YO LETTER AN;Lo;0;L;;;;;N;;;;; +1E6F1;TAI YO LETTER AM;Lo;0;L;;;;;N;;;;; +1E6F2;TAI YO LETTER AK;Lo;0;L;;;;;N;;;;; +1E6F3;TAI YO LETTER AT;Lo;0;L;;;;;N;;;;; +1E6F4;TAI YO LETTER AP;Lo;0;L;;;;;N;;;;; +1E6F5;TAI YO SIGN OM;Mn;230;NSM;;;;;N;;;;; +1E6FE;TAI YO SYMBOL MUEANG;Lo;0;L;;;;;N;;;;; +1E6FF;TAI YO XAM LAI;Lm;0;L;;;;;N;;;;; 1E7E0;ETHIOPIC SYLLABLE HHYA;Lo;0;L;;;;;N;;;;; 1E7E1;ETHIOPIC SYLLABLE HHYU;Lo;0;L;;;;;N;;;;; 1E7E2;ETHIOPIC SYLLABLE HHYI;Lo;0;L;;;;;N;;;;; @@ -38079,6 +38511,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1F6D5;HINDU TEMPLE;So;0;ON;;;;;N;;;;; 1F6D6;HUT;So;0;ON;;;;;N;;;;; 1F6D7;ELEVATOR;So;0;ON;;;;;N;;;;; +1F6D8;LANDSLIDE;So;0;ON;;;;;N;;;;; 1F6DC;WIRELESS;So;0;ON;;;;;N;;;;; 1F6DD;PLAYGROUND SLIDE;So;0;ON;;;;;N;;;;; 1F6DE;WHEEL;So;0;ON;;;;;N;;;;; @@ -38228,6 +38661,10 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1F774;LOT OF FORTUNE;So;0;ON;;;;;N;;;;; 1F775;OCCULTATION;So;0;ON;;;;;N;;;;; 1F776;LUNAR ECLIPSE;So;0;ON;;;;;N;;;;; +1F777;VESTA FORM TWO;So;0;ON;;;;;N;;;;; +1F778;ASTRAEA FORM TWO;So;0;ON;;;;;N;;;;; +1F779;HYGIEA FORM TWO;So;0;ON;;;;;N;;;;; +1F77A;PARTHENOPE FORM TWO;So;0;ON;;;;;N;;;;; 1F77B;HAUMEA;So;0;ON;;;;;N;;;;; 1F77C;MAKEMAKE;So;0;ON;;;;;N;;;;; 1F77D;GONGGONG;So;0;ON;;;;;N;;;;; @@ -38498,6 +38935,15 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1F8BB;SOUTH WEST ARROW FROM BAR;So;0;ON;;;;;N;;;;; 1F8C0;LEFTWARDS ARROW FROM DOWNWARDS ARROW;So;0;ON;;;;;N;;;;; 1F8C1;RIGHTWARDS ARROW FROM DOWNWARDS ARROW;So;0;ON;;;;;N;;;;; +1F8D0;LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW;Sm;0;ON;;;;;N;;;;; +1F8D1;LONG RIGHTWARDS HARPOON OVER LONG LEFTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D2;LONG RIGHTWARDS HARPOON ABOVE SHORT LEFTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D3;SHORT RIGHTWARDS HARPOON ABOVE LONG LEFTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D4;LONG LEFTWARDS HARPOON ABOVE SHORT RIGHTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D5;SHORT LEFTWARDS HARPOON 
ABOVE LONG RIGHTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D6;LONG RIGHTWARDS ARROW THROUGH X;Sm;0;ON;;;;;N;;;;; +1F8D7;LONG RIGHTWARDS ARROW WITH DOUBLE SLASH;Sm;0;ON;;;;;N;;;;; +1F8D8;LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE;Sm;0;ON;;;;;N;;;;; 1F900;CIRCLED CROSS FORMEE WITH FOUR DOTS;So;0;ON;;;;;N;;;;; 1F901;CIRCLED CROSS FORMEE WITH TWO DOTS;So;0;ON;;;;;N;;;;; 1F902;CIRCLED CROSS FORMEE;So;0;ON;;;;;N;;;;; @@ -38838,6 +39284,10 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FA51;BLACK CHESS KNIGHT-QUEEN;So;0;ON;;;;;N;;;;; 1FA52;BLACK CHESS KNIGHT-ROOK;So;0;ON;;;;;N;;;;; 1FA53;BLACK CHESS KNIGHT-BISHOP;So;0;ON;;;;;N;;;;; +1FA54;WHITE CHESS FERZ;So;0;ON;;;;;N;;;;; +1FA55;WHITE CHESS ALFIL;So;0;ON;;;;;N;;;;; +1FA56;BLACK CHESS FERZ;So;0;ON;;;;;N;;;;; +1FA57;BLACK CHESS ALFIL;So;0;ON;;;;;N;;;;; 1FA60;XIANGQI RED GENERAL;So;0;ON;;;;;N;;;;; 1FA61;XIANGQI RED MANDARIN;So;0;ON;;;;;N;;;;; 1FA62;XIANGQI RED ELEPHANT;So;0;ON;;;;;N;;;;; @@ -38875,6 +39325,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FA87;MARACAS;So;0;ON;;;;;N;;;;; 1FA88;FLUTE;So;0;ON;;;;;N;;;;; 1FA89;HARP;So;0;ON;;;;;N;;;;; +1FA8A;TROMBONE;So;0;ON;;;;;N;;;;; +1FA8E;TREASURE CHEST;So;0;ON;;;;;N;;;;; 1FA8F;SHOVEL;So;0;ON;;;;;N;;;;; 1FA90;RINGED PLANET;So;0;ON;;;;;N;;;;; 1FA91;CHAIR;So;0;ON;;;;;N;;;;; @@ -38931,6 +39383,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FAC4;PREGNANT PERSON;So;0;ON;;;;;N;;;;; 1FAC5;PERSON WITH CROWN;So;0;ON;;;;;N;;;;; 1FAC6;FINGERPRINT;So;0;ON;;;;;N;;;;; +1FAC8;HAIRY CREATURE;So;0;ON;;;;;N;;;;; +1FACD;ORCA;So;0;ON;;;;;N;;;;; 1FACE;MOOSE;So;0;ON;;;;;N;;;;; 1FACF;DONKEY;So;0;ON;;;;;N;;;;; 1FAD0;BLUEBERRIES;So;0;ON;;;;;N;;;;; @@ -38957,6 +39411,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FAE7;BUBBLES;So;0;ON;;;;;N;;;;; 1FAE8;SHAKING FACE;So;0;ON;;;;;N;;;;; 1FAE9;FACE WITH BAGS UNDER EYES;So;0;ON;;;;;N;;;;; +1FAEA;DISTORTED FACE;So;0;ON;;;;;N;;;;; +1FAEF;FIGHT CLOUD;So;0;ON;;;;;N;;;;; 1FAF0;HAND WITH INDEX FINGER AND THUMB CROSSED;So;0;ON;;;;;N;;;;; 1FAF1;RIGHTWARDS 
HAND;So;0;ON;;;;;N;;;;; 1FAF2;LEFTWARDS HAND;So;0;ON;;;;;N;;;;; @@ -39215,14 +39671,15 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FBF7;SEGMENTED DIGIT SEVEN;Nd;0;EN; 0037;7;7;7;N;;;;; 1FBF8;SEGMENTED DIGIT EIGHT;Nd;0;EN; 0038;8;8;8;N;;;;; 1FBF9;SEGMENTED DIGIT NINE;Nd;0;EN; 0039;9;9;9;N;;;;; +1FBFA;ALARM BELL SYMBOL;So;0;ON;;;;;N;;;;; 20000;;Lo;0;L;;;;;N;;;;; 2A6DF;;Lo;0;L;;;;;N;;;;; 2A700;;Lo;0;L;;;;;N;;;;; -2B739;;Lo;0;L;;;;;N;;;;; +2B73F;;Lo;0;L;;;;;N;;;;; 2B740;;Lo;0;L;;;;;N;;;;; 2B81D;;Lo;0;L;;;;;N;;;;; 2B820;;Lo;0;L;;;;;N;;;;; -2CEA1;;Lo;0;L;;;;;N;;;;; +2CEAD;;Lo;0;L;;;;;N;;;;; 2CEB0;;Lo;0;L;;;;;N;;;;; 2EBE0;;Lo;0;L;;;;;N;;;;; 2EBF0;;Lo;0;L;;;;;N;;;;; @@ -39773,6 +40230,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 3134A;;Lo;0;L;;;;;N;;;;; 31350;;Lo;0;L;;;;;N;;;;; 323AF;;Lo;0;L;;;;;N;;;;; +323B0;;Lo;0;L;;;;;N;;;;; +33479;;Lo;0;L;;;;;N;;;;; E0001;LANGUAGE TAG;Cf;0;BN;;;;;N;;;;; E0020;TAG SPACE;Cf;0;BN;;;;;N;;;;; E0021;TAG EXCLAMATION MARK;Cf;0;BN;;;;;N;;;;; diff --git a/src/libre/class.h b/src/libre/class.h index 919571982..79460a9da 100644 --- a/src/libre/class.h +++ b/src/libre/class.h @@ -29,6 +29,7 @@ extern const struct class utf8_Bamum; extern const struct class utf8_Bassa_Vah; extern const struct class utf8_Batak; extern const struct class utf8_Bengali; +extern const struct class utf8_Beria_Erfe; extern const struct class utf8_Bhaiksuki; extern const struct class utf8_Bopomofo; extern const struct class utf8_Brahmi; @@ -153,6 +154,7 @@ extern const struct class utf8_Saurashtra; extern const struct class utf8_Sharada; extern const struct class utf8_Shavian; extern const struct class utf8_Siddham; +extern const struct class utf8_Sidetic; extern const struct class utf8_SignWriting; extern const struct class utf8_Sinhala; extern const struct class utf8_Sogdian; @@ -167,6 +169,7 @@ extern const struct class utf8_Tagbanwa; extern const struct class utf8_Tai_Le; extern const struct class utf8_Tai_Tham; extern const struct class utf8_Tai_Viet; 
+extern const struct class utf8_Tai_Yo; extern const struct class utf8_Takri; extern const struct class utf8_Tamil; extern const struct class utf8_Tangsa; @@ -178,6 +181,7 @@ extern const struct class utf8_Tibetan; extern const struct class utf8_Tifinagh; extern const struct class utf8_Tirhuta; extern const struct class utf8_Todhri; +extern const struct class utf8_Tolong_Siki; extern const struct class utf8_Toto; extern const struct class utf8_Tulu_Tigalari; extern const struct class utf8_Ugaritic; diff --git a/src/libre/class/utf8_Arabic.c b/src/libre/class/utf8_Arabic.c index 904ab392e..592e06d7d 100644 --- a/src/libre/class/utf8_Arabic.c +++ b/src/libre/class/utf8_Arabic.c @@ -13,21 +13,18 @@ static const struct range ranges[] = { { 0x0671UL, 0x06DCUL }, { 0x06DEUL, 0x06FFUL }, { 0x0750UL, 0x077FUL }, - { 0x0870UL, 0x088EUL }, - { 0x0890UL, 0x0891UL }, + { 0x0870UL, 0x0891UL }, { 0x0897UL, 0x08E1UL }, { 0x08E3UL, 0x08FFUL }, - { 0xFB50UL, 0xFBC2UL }, - { 0xFBD3UL, 0xFD3DUL }, - { 0xFD40UL, 0xFD8FUL }, - { 0xFD92UL, 0xFDC7UL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFB50UL, 0xFD3DUL }, + { 0xFD40UL, 0xFDCFUL }, { 0xFDF0UL, 0xFDFFUL }, { 0xFE70UL, 0xFE74UL }, { 0xFE76UL, 0xFEFCUL }, { 0x10E60UL, 0x10E7EUL }, - { 0x10EC2UL, 0x10EC4UL }, - { 0x10EFCUL, 0x10EFFUL }, + { 0x10EC2UL, 0x10EC7UL }, + { 0x10ED0UL, 0x10ED8UL }, + { 0x10EFAUL, 0x10EFFUL }, { 0x1EE00UL, 0x1EE03UL }, { 0x1EE05UL, 0x1EE1FUL }, { 0x1EE21UL, 0x1EE22UL }, diff --git a/src/libre/class/utf8_Beria_Erfe.c b/src/libre/class/utf8_Beria_Erfe.c new file mode 100644 index 000000000..3bf7b4db7 --- /dev/null +++ b/src/libre/class/utf8_Beria_Erfe.c @@ -0,0 +1,14 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x16EA0UL, 0x16EB8UL }, + { 0x16EBBUL, 0x16ED3UL } +}; + +const struct class utf8_Beria_Erfe = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_Common.c b/src/libre/class/utf8_Common.c index 44e6f3ff2..26c3f54e8 100644 --- 
a/src/libre/class/utf8_Common.c +++ b/src/libre/class/utf8_Common.c @@ -43,7 +43,7 @@ static const struct range ranges[] = { { 0x2066UL, 0x2070UL }, { 0x2074UL, 0x207EUL }, { 0x2080UL, 0x208EUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, { 0x2100UL, 0x2125UL }, { 0x2127UL, 0x2129UL }, { 0x212CUL, 0x2131UL }, @@ -54,8 +54,7 @@ static const struct range ranges[] = { { 0x2440UL, 0x244AUL }, { 0x2460UL, 0x27FFUL }, { 0x2900UL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2BFFUL }, + { 0x2B76UL, 0x2BFFUL }, { 0x2E00UL, 0x2E5DUL }, { 0x2FF0UL, 0x3004UL }, { 0x3006UL, 0x3006UL }, @@ -101,8 +100,10 @@ static const struct range ranges[] = { { 0x101D0UL, 0x101FCUL }, { 0x102E1UL, 0x102FBUL }, { 0x1BCA0UL, 0x1BCA3UL }, - { 0x1CC00UL, 0x1CCF9UL }, + { 0x1CC00UL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEF0UL }, { 0x1CF50UL, 0x1CFC3UL }, { 0x1D000UL, 0x1D0F5UL }, { 0x1D100UL, 0x1D126UL }, @@ -151,11 +152,10 @@ static const struct range ranges[] = { { 0x1F240UL, 0x1F248UL }, { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, - { 0x1F300UL, 0x1F6D7UL }, + { 0x1F300UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -165,16 +165,18 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F8D0UL, 0x1F8D8UL }, + { 0x1F900UL, 0x1FA57UL }, { 0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 0x1FA80UL, 0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL }, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 0x1FBF9UL }, 
+ { 0x1FB94UL, 0x1FBFAUL }, { 0xE0001UL, 0xE0001UL }, { 0xE0020UL, 0xE007FUL } }; diff --git a/src/libre/class/utf8_Han.c b/src/libre/class/utf8_Han.c index ce3bbce61..0fdf663e0 100644 --- a/src/libre/class/utf8_Han.c +++ b/src/libre/class/utf8_Han.c @@ -29,16 +29,15 @@ static const struct range ranges[] = { { 0xF900UL, 0xFA6DUL }, { 0xFA70UL, 0xFAD9UL }, { 0x16FE2UL, 0x16FE3UL }, - { 0x16FF0UL, 0x16FF1UL }, + { 0x16FF0UL, 0x16FF6UL }, { 0x20000UL, 0x2A6DFUL }, - { 0x2A700UL, 0x2B739UL }, - { 0x2B740UL, 0x2B81DUL }, - { 0x2B820UL, 0x2CEA1UL }, + { 0x2A700UL, 0x2B81DUL }, + { 0x2B820UL, 0x2CEADUL }, { 0x2CEB0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EE5DUL }, { 0x2F800UL, 0x2FA1DUL }, { 0x30000UL, 0x3134AUL }, - { 0x31350UL, 0x323AFUL }, + { 0x31350UL, 0x33479UL }, { 0x1720UL, 0x1734UL }, { 0x10D00UL, 0x10D27UL }, { 0x10D30UL, 0x10D39UL } diff --git a/src/libre/class/utf8_Inherited.c b/src/libre/class/utf8_Inherited.c index 8e5afc76c..46ee6739a 100644 --- a/src/libre/class/utf8_Inherited.c +++ b/src/libre/class/utf8_Inherited.c @@ -8,7 +8,8 @@ static const struct range ranges[] = { { 0x064BUL, 0x0655UL }, { 0x0670UL, 0x0670UL }, { 0x0951UL, 0x0954UL }, - { 0x1AB0UL, 0x1ACEUL }, + { 0x1AB0UL, 0x1ADDUL }, + { 0x1AE0UL, 0x1AEBUL }, { 0x1CD0UL, 0x1CD2UL }, { 0x1CD4UL, 0x1CE0UL }, { 0x1CE2UL, 0x1CE8UL }, diff --git a/src/libre/class/utf8_Kannada.c b/src/libre/class/utf8_Kannada.c index c2f9ae6e6..a4f8ebd40 100644 --- a/src/libre/class/utf8_Kannada.c +++ b/src/libre/class/utf8_Kannada.c @@ -12,7 +12,7 @@ static const struct range ranges[] = { { 0x0CC6UL, 0x0CC8UL }, { 0x0CCAUL, 0x0CCDUL }, { 0x0CD5UL, 0x0CD6UL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE3UL }, { 0x0CE6UL, 0x0CEFUL }, { 0x0CF1UL, 0x0CF3UL } diff --git a/src/libre/class/utf8_L.c b/src/libre/class/utf8_L.c index b2c396fcc..e37048bc1 100644 --- a/src/libre/class/utf8_L.c +++ b/src/libre/class/utf8_L.c @@ -53,7 +53,7 @@ static const struct range ranges[] = { { 0x0840UL, 0x0858UL }, { 
0x0860UL, 0x086AUL }, { 0x0870UL, 0x0887UL }, - { 0x0889UL, 0x088EUL }, + { 0x0889UL, 0x088FUL }, { 0x08A0UL, 0x08C9UL }, { 0x0904UL, 0x0939UL }, { 0x093DUL, 0x093DUL }, @@ -119,7 +119,7 @@ static const struct range ranges[] = { { 0x0C2AUL, 0x0C39UL }, { 0x0C3DUL, 0x0C3DUL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C61UL }, { 0x0C80UL, 0x0C80UL }, { 0x0C85UL, 0x0C8CUL }, @@ -128,7 +128,7 @@ static const struct range ranges[] = { { 0x0CAAUL, 0x0CB3UL }, { 0x0CB5UL, 0x0CB9UL }, { 0x0CBDUL, 0x0CBDUL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE1UL }, { 0x0CF1UL, 0x0CF2UL }, { 0x0D04UL, 0x0D0CUL }, @@ -314,11 +314,8 @@ static const struct range ranges[] = { { 0xA6A0UL, 0xA6E5UL }, { 0xA717UL, 0xA71FUL }, { 0xA722UL, 0xA788UL }, - { 0xA78BUL, 0xA7CDUL }, - { 0xA7D0UL, 0xA7D1UL }, - { 0xA7D3UL, 0xA7D3UL }, - { 0xA7D5UL, 0xA7DCUL }, - { 0xA7F2UL, 0xA801UL }, + { 0xA78BUL, 0xA7DCUL }, + { 0xA7F1UL, 0xA801UL }, { 0xA803UL, 0xA805UL }, { 0xA807UL, 0xA80AUL }, { 0xA80CUL, 0xA822UL }, @@ -434,6 +431,7 @@ static const struct range ranges[] = { { 0x108F4UL, 0x108F5UL }, { 0x10900UL, 0x10915UL }, { 0x10920UL, 0x10939UL }, + { 0x10940UL, 0x10959UL }, { 0x10980UL, 0x109B7UL }, { 0x109BEUL, 0x109BFUL }, { 0x10A00UL, 0x10A00UL }, @@ -456,7 +454,7 @@ static const struct range ranges[] = { { 0x10D6FUL, 0x10D85UL }, { 0x10E80UL, 0x10EA9UL }, { 0x10EB0UL, 0x10EB1UL }, - { 0x10EC2UL, 0x10EC4UL }, + { 0x10EC2UL, 0x10EC7UL }, { 0x10F00UL, 0x10F1CUL }, { 0x10F27UL, 0x10F27UL }, { 0x10F30UL, 0x10F45UL }, @@ -549,6 +547,7 @@ static const struct range ranges[] = { { 0x11D67UL, 0x11D68UL }, { 0x11D6AUL, 0x11D89UL }, { 0x11D98UL, 0x11D98UL }, + { 0x11DB0UL, 0x11DDBUL }, { 0x11EE0UL, 0x11EF2UL }, { 0x11F02UL, 0x11F02UL }, { 0x11F04UL, 0x11F10UL }, @@ -572,16 +571,19 @@ static const struct range ranges[] = { { 0x16B7DUL, 0x16B8FUL }, { 0x16D40UL, 0x16D6CUL }, { 0x16E40UL, 0x16E7FUL }, + { 0x16EA0UL, 0x16EB8UL }, + { 
0x16EBBUL, 0x16ED3UL }, { 0x16F00UL, 0x16F4AUL }, { 0x16F50UL, 0x16F50UL }, { 0x16F93UL, 0x16F9FUL }, { 0x16FE0UL, 0x16FE1UL }, { 0x16FE3UL, 0x16FE3UL }, + { 0x16FF2UL, 0x16FF3UL }, { 0x17000UL, 0x17000UL }, - { 0x187F7UL, 0x187F7UL }, - { 0x18800UL, 0x18CD5UL }, + { 0x187FFUL, 0x18CD5UL }, { 0x18CFFUL, 0x18D00UL }, - { 0x18D08UL, 0x18D08UL }, + { 0x18D1EUL, 0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL }, { 0x1AFF0UL, 0x1AFF3UL }, { 0x1AFF5UL, 0x1AFFBUL }, { 0x1AFFDUL, 0x1AFFEUL }, @@ -636,6 +638,12 @@ static const struct range ranges[] = { { 0x1E4D0UL, 0x1E4EBUL }, { 0x1E5D0UL, 0x1E5EDUL }, { 0x1E5F0UL, 0x1E5F0UL }, + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6E2UL }, + { 0x1E6E4UL, 0x1E6E5UL }, + { 0x1E6E7UL, 0x1E6EDUL }, + { 0x1E6F0UL, 0x1E6F4UL }, + { 0x1E6FEUL, 0x1E6FFUL }, { 0x1E7E0UL, 0x1E7E6UL }, { 0x1E7E8UL, 0x1E7EBUL }, { 0x1E7EDUL, 0x1E7EEUL }, @@ -679,11 +687,10 @@ static const struct range ranges[] = { { 0x20000UL, 0x20000UL }, { 0x2A6DFUL, 0x2A6DFUL }, { 0x2A700UL, 0x2A700UL }, - { 0x2B739UL, 0x2B739UL }, - { 0x2B740UL, 0x2B740UL }, + { 0x2B73FUL, 0x2B740UL }, { 0x2B81DUL, 0x2B81DUL }, { 0x2B820UL, 0x2B820UL }, - { 0x2CEA1UL, 0x2CEA1UL }, + { 0x2CEADUL, 0x2CEADUL }, { 0x2CEB0UL, 0x2CEB0UL }, { 0x2EBE0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EBF0UL }, @@ -692,7 +699,8 @@ static const struct range ranges[] = { { 0x30000UL, 0x30000UL }, { 0x3134AUL, 0x3134AUL }, { 0x31350UL, 0x31350UL }, - { 0x323AFUL, 0x323AFUL } + { 0x323AFUL, 0x323B0UL }, + { 0x33479UL, 0x33479UL } }; const struct class utf8_L = { diff --git a/src/libre/class/utf8_Latin.c b/src/libre/class/utf8_Latin.c index 9f59bd35e..362ab61a6 100644 --- a/src/libre/class/utf8_Latin.c +++ b/src/libre/class/utf8_Latin.c @@ -26,11 +26,8 @@ static const struct range ranges[] = { { 0x2160UL, 0x2188UL }, { 0x2C60UL, 0x2C7FUL }, { 0xA722UL, 0xA787UL }, - { 0xA78BUL, 0xA7CDUL }, - { 0xA7D0UL, 0xA7D1UL }, - { 0xA7D3UL, 0xA7D3UL }, - { 0xA7D5UL, 0xA7DCUL }, - { 0xA7F2UL, 0xA7FFUL }, + { 0xA78BUL, 0xA7DCUL }, + { 
0xA7F1UL, 0xA7FFUL }, { 0xAB30UL, 0xAB5AUL }, { 0xAB5CUL, 0xAB64UL }, { 0xAB66UL, 0xAB69UL }, diff --git a/src/libre/class/utf8_Ll.c b/src/libre/class/utf8_Ll.c index 889a1da5e..ad884ead6 100644 --- a/src/libre/class/utf8_Ll.c +++ b/src/libre/class/utf8_Ll.c @@ -148,7 +148,7 @@ static const struct range ranges[] = { { 0x024BUL, 0x024BUL }, { 0x024DUL, 0x024DUL }, { 0x024FUL, 0x0293UL }, - { 0x0295UL, 0x02AFUL }, + { 0x0296UL, 0x02AFUL }, { 0x0371UL, 0x0371UL }, { 0x0373UL, 0x0373UL }, { 0x0377UL, 0x0377UL }, @@ -609,6 +609,7 @@ static const struct range ranges[] = { { 0xA7C8UL, 0xA7C8UL }, { 0xA7CAUL, 0xA7CAUL }, { 0xA7CDUL, 0xA7CDUL }, + { 0xA7CFUL, 0xA7CFUL }, { 0xA7D1UL, 0xA7D1UL }, { 0xA7D3UL, 0xA7D3UL }, { 0xA7D5UL, 0xA7D5UL }, @@ -633,6 +634,7 @@ static const struct range ranges[] = { { 0x10D70UL, 0x10D85UL }, { 0x118C0UL, 0x118DFUL }, { 0x16E60UL, 0x16E7FUL }, + { 0x16EBBUL, 0x16ED3UL }, { 0x1D41AUL, 0x1D433UL }, { 0x1D44EUL, 0x1D454UL }, { 0x1D456UL, 0x1D467UL }, diff --git a/src/libre/class/utf8_Lm.c b/src/libre/class/utf8_Lm.c index 884125481..3052e9072 100644 --- a/src/libre/class/utf8_Lm.c +++ b/src/libre/class/utf8_Lm.c @@ -49,7 +49,7 @@ static const struct range ranges[] = { { 0xA717UL, 0xA71FUL }, { 0xA770UL, 0xA770UL }, { 0xA788UL, 0xA788UL }, - { 0xA7F2UL, 0xA7F4UL }, + { 0xA7F1UL, 0xA7F4UL }, { 0xA7F8UL, 0xA7F9UL }, { 0xA9CFUL, 0xA9CFUL }, { 0xA9E6UL, 0xA9E6UL }, @@ -65,18 +65,22 @@ static const struct range ranges[] = { { 0x107B2UL, 0x107BAUL }, { 0x10D4EUL, 0x10D4EUL }, { 0x10D6FUL, 0x10D6FUL }, + { 0x10EC5UL, 0x10EC5UL }, + { 0x11DD9UL, 0x11DD9UL }, { 0x16B40UL, 0x16B43UL }, { 0x16D40UL, 0x16D42UL }, { 0x16D6BUL, 0x16D6CUL }, { 0x16F93UL, 0x16F9FUL }, { 0x16FE0UL, 0x16FE1UL }, { 0x16FE3UL, 0x16FE3UL }, + { 0x16FF2UL, 0x16FF3UL }, { 0x1AFF0UL, 0x1AFF3UL }, { 0x1AFF5UL, 0x1AFFBUL }, { 0x1AFFDUL, 0x1AFFEUL }, { 0x1E030UL, 0x1E06DUL }, { 0x1E137UL, 0x1E13DUL }, { 0x1E4EBUL, 0x1E4EBUL }, + { 0x1E6FFUL, 0x1E6FFUL }, { 0x1E94BUL, 0x1E94BUL } }; diff 
--git a/src/libre/class/utf8_Lo.c b/src/libre/class/utf8_Lo.c index 0ba2e7a9b..859b511dc 100644 --- a/src/libre/class/utf8_Lo.c +++ b/src/libre/class/utf8_Lo.c @@ -7,7 +7,7 @@ static const struct range ranges[] = { { 0x00BAUL, 0x00BAUL }, { 0x01BBUL, 0x01BBUL }, { 0x01C0UL, 0x01C3UL }, - { 0x0294UL, 0x0294UL }, + { 0x0294UL, 0x0295UL }, { 0x05D0UL, 0x05EAUL }, { 0x05EFUL, 0x05F2UL }, { 0x0620UL, 0x063FUL }, @@ -27,7 +27,7 @@ static const struct range ranges[] = { { 0x0840UL, 0x0858UL }, { 0x0860UL, 0x086AUL }, { 0x0870UL, 0x0887UL }, - { 0x0889UL, 0x088EUL }, + { 0x0889UL, 0x088FUL }, { 0x08A0UL, 0x08C8UL }, { 0x0904UL, 0x0939UL }, { 0x093DUL, 0x093DUL }, @@ -93,7 +93,7 @@ static const struct range ranges[] = { { 0x0C2AUL, 0x0C39UL }, { 0x0C3DUL, 0x0C3DUL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C61UL }, { 0x0C80UL, 0x0C80UL }, { 0x0C85UL, 0x0C8CUL }, @@ -102,7 +102,7 @@ static const struct range ranges[] = { { 0x0CAAUL, 0x0CB3UL }, { 0x0CB5UL, 0x0CB9UL }, { 0x0CBDUL, 0x0CBDUL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE1UL }, { 0x0CF1UL, 0x0CF2UL }, { 0x0D04UL, 0x0D0CUL }, @@ -331,6 +331,7 @@ static const struct range ranges[] = { { 0x108F4UL, 0x108F5UL }, { 0x10900UL, 0x10915UL }, { 0x10920UL, 0x10939UL }, + { 0x10940UL, 0x10959UL }, { 0x10980UL, 0x109B7UL }, { 0x109BEUL, 0x109BFUL }, { 0x10A00UL, 0x10A00UL }, @@ -352,6 +353,7 @@ static const struct range ranges[] = { { 0x10E80UL, 0x10EA9UL }, { 0x10EB0UL, 0x10EB1UL }, { 0x10EC2UL, 0x10EC4UL }, + { 0x10EC6UL, 0x10EC7UL }, { 0x10F00UL, 0x10F1CUL }, { 0x10F27UL, 0x10F27UL }, { 0x10F30UL, 0x10F45UL }, @@ -443,6 +445,8 @@ static const struct range ranges[] = { { 0x11D67UL, 0x11D68UL }, { 0x11D6AUL, 0x11D89UL }, { 0x11D98UL, 0x11D98UL }, + { 0x11DB0UL, 0x11DD8UL }, + { 0x11DDAUL, 0x11DDBUL }, { 0x11EE0UL, 0x11EF2UL }, { 0x11F02UL, 0x11F02UL }, { 0x11F04UL, 0x11F10UL }, @@ -467,10 +471,10 @@ static const struct range ranges[] = { { 
0x16F00UL, 0x16F4AUL }, { 0x16F50UL, 0x16F50UL }, { 0x17000UL, 0x17000UL }, - { 0x187F7UL, 0x187F7UL }, - { 0x18800UL, 0x18CD5UL }, + { 0x187FFUL, 0x18CD5UL }, { 0x18CFFUL, 0x18D00UL }, - { 0x18D08UL, 0x18D08UL }, + { 0x18D1EUL, 0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL }, { 0x1B000UL, 0x1B122UL }, { 0x1B132UL, 0x1B132UL }, { 0x1B150UL, 0x1B152UL }, @@ -489,6 +493,12 @@ static const struct range ranges[] = { { 0x1E4D0UL, 0x1E4EAUL }, { 0x1E5D0UL, 0x1E5EDUL }, { 0x1E5F0UL, 0x1E5F0UL }, + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6E2UL }, + { 0x1E6E4UL, 0x1E6E5UL }, + { 0x1E6E7UL, 0x1E6EDUL }, + { 0x1E6F0UL, 0x1E6F4UL }, + { 0x1E6FEUL, 0x1E6FEUL }, { 0x1E7E0UL, 0x1E7E6UL }, { 0x1E7E8UL, 0x1E7EBUL }, { 0x1E7EDUL, 0x1E7EEUL }, @@ -530,11 +540,10 @@ static const struct range ranges[] = { { 0x20000UL, 0x20000UL }, { 0x2A6DFUL, 0x2A6DFUL }, { 0x2A700UL, 0x2A700UL }, - { 0x2B739UL, 0x2B739UL }, - { 0x2B740UL, 0x2B740UL }, + { 0x2B73FUL, 0x2B740UL }, { 0x2B81DUL, 0x2B81DUL }, { 0x2B820UL, 0x2B820UL }, - { 0x2CEA1UL, 0x2CEA1UL }, + { 0x2CEADUL, 0x2CEADUL }, { 0x2CEB0UL, 0x2CEB0UL }, { 0x2EBE0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EBF0UL }, @@ -543,7 +552,8 @@ static const struct range ranges[] = { { 0x30000UL, 0x30000UL }, { 0x3134AUL, 0x3134AUL }, { 0x31350UL, 0x31350UL }, - { 0x323AFUL, 0x323AFUL } + { 0x323AFUL, 0x323B0UL }, + { 0x33479UL, 0x33479UL } }; const struct class utf8_Lo = { diff --git a/src/libre/class/utf8_Lu.c b/src/libre/class/utf8_Lu.c index 150038e7e..e1f752b18 100644 --- a/src/libre/class/utf8_Lu.c +++ b/src/libre/class/utf8_Lu.c @@ -605,7 +605,10 @@ static const struct range ranges[] = { { 0xA7C4UL, 0xA7C7UL }, { 0xA7C9UL, 0xA7C9UL }, { 0xA7CBUL, 0xA7CCUL }, + { 0xA7CEUL, 0xA7CEUL }, { 0xA7D0UL, 0xA7D0UL }, + { 0xA7D2UL, 0xA7D2UL }, + { 0xA7D4UL, 0xA7D4UL }, { 0xA7D6UL, 0xA7D6UL }, { 0xA7D8UL, 0xA7D8UL }, { 0xA7DAUL, 0xA7DAUL }, @@ -622,6 +625,7 @@ static const struct range ranges[] = { { 0x10D50UL, 0x10D65UL }, { 0x118A0UL, 0x118BFUL }, { 0x16E40UL, 0x16E5FUL 
}, + { 0x16EA0UL, 0x16EB8UL }, { 0x1D400UL, 0x1D419UL }, { 0x1D434UL, 0x1D44DUL }, { 0x1D468UL, 0x1D481UL }, diff --git a/src/libre/class/utf8_M.c b/src/libre/class/utf8_M.c index f0b3e3af6..ce4a80f2b 100644 --- a/src/libre/class/utf8_M.c +++ b/src/libre/class/utf8_M.c @@ -139,7 +139,8 @@ static const struct range ranges[] = { { 0x1A55UL, 0x1A5EUL }, { 0x1A60UL, 0x1A7CUL }, { 0x1A7FUL, 0x1A7FUL }, - { 0x1AB0UL, 0x1ACEUL }, + { 0x1AB0UL, 0x1ADDUL }, + { 0x1AE0UL, 0x1AEBUL }, { 0x1B00UL, 0x1B04UL }, { 0x1B34UL, 0x1B44UL }, { 0x1B6BUL, 0x1B73UL }, @@ -205,7 +206,7 @@ static const struct range ranges[] = { { 0x10D24UL, 0x10D27UL }, { 0x10D69UL, 0x10D6DUL }, { 0x10EABUL, 0x10EACUL }, - { 0x10EFCUL, 0x10EFFUL }, + { 0x10EFAUL, 0x10EFFUL }, { 0x10F46UL, 0x10F50UL }, { 0x10F82UL, 0x10F85UL }, { 0x11000UL, 0x11002UL }, @@ -267,6 +268,7 @@ static const struct range ranges[] = { { 0x11A47UL, 0x11A47UL }, { 0x11A51UL, 0x11A5BUL }, { 0x11A8AUL, 0x11A99UL }, + { 0x11B60UL, 0x11B67UL }, { 0x11C2FUL, 0x11C36UL }, { 0x11C38UL, 0x11C3FUL }, { 0x11C92UL, 0x11CA7UL }, @@ -321,6 +323,10 @@ static const struct range ranges[] = { { 0x1E2ECUL, 0x1E2EFUL }, { 0x1E4ECUL, 0x1E4EFUL }, { 0x1E5EEUL, 0x1E5EFUL }, + { 0x1E6E3UL, 0x1E6E3UL }, + { 0x1E6E6UL, 0x1E6E6UL }, + { 0x1E6EEUL, 0x1E6EFUL }, + { 0x1E6F5UL, 0x1E6F5UL }, { 0x1E8D0UL, 0x1E8D6UL }, { 0x1E944UL, 0x1E94AUL }, { 0xE0100UL, 0xE01EFUL } diff --git a/src/libre/class/utf8_Mc.c b/src/libre/class/utf8_Mc.c index 77917668c..aa391dcb9 100644 --- a/src/libre/class/utf8_Mc.c +++ b/src/libre/class/utf8_Mc.c @@ -175,6 +175,9 @@ static const struct range ranges[] = { { 0x11A39UL, 0x11A39UL }, { 0x11A57UL, 0x11A58UL }, { 0x11A97UL, 0x11A97UL }, + { 0x11B61UL, 0x11B61UL }, + { 0x11B65UL, 0x11B65UL }, + { 0x11B67UL, 0x11B67UL }, { 0x11C2FUL, 0x11C2FUL }, { 0x11C3EUL, 0x11C3EUL }, { 0x11CA9UL, 0x11CA9UL }, diff --git a/src/libre/class/utf8_Mn.c b/src/libre/class/utf8_Mn.c index c204fa592..4ad1f3ddd 100644 --- a/src/libre/class/utf8_Mn.c +++ 
b/src/libre/class/utf8_Mn.c @@ -145,7 +145,8 @@ static const struct range ranges[] = { { 0x1A73UL, 0x1A7CUL }, { 0x1A7FUL, 0x1A7FUL }, { 0x1AB0UL, 0x1ABDUL }, - { 0x1ABFUL, 0x1ACEUL }, + { 0x1ABFUL, 0x1ADDUL }, + { 0x1AE0UL, 0x1AEBUL }, { 0x1B00UL, 0x1B03UL }, { 0x1B34UL, 0x1B34UL }, { 0x1B36UL, 0x1B3AUL }, @@ -227,7 +228,7 @@ static const struct range ranges[] = { { 0x10D24UL, 0x10D27UL }, { 0x10D69UL, 0x10D6DUL }, { 0x10EABUL, 0x10EACUL }, - { 0x10EFCUL, 0x10EFFUL }, + { 0x10EFAUL, 0x10EFFUL }, { 0x10F46UL, 0x10F50UL }, { 0x10F82UL, 0x10F85UL }, { 0x11001UL, 0x11001UL }, @@ -302,6 +303,9 @@ static const struct range ranges[] = { { 0x11A59UL, 0x11A5BUL }, { 0x11A8AUL, 0x11A96UL }, { 0x11A98UL, 0x11A99UL }, + { 0x11B60UL, 0x11B60UL }, + { 0x11B62UL, 0x11B64UL }, + { 0x11B66UL, 0x11B66UL }, { 0x11C30UL, 0x11C36UL }, { 0x11C38UL, 0x11C3DUL }, { 0x11C3FUL, 0x11C3FUL }, @@ -357,6 +361,10 @@ static const struct range ranges[] = { { 0x1E2ECUL, 0x1E2EFUL }, { 0x1E4ECUL, 0x1E4EFUL }, { 0x1E5EEUL, 0x1E5EFUL }, + { 0x1E6E3UL, 0x1E6E3UL }, + { 0x1E6E6UL, 0x1E6E6UL }, + { 0x1E6EEUL, 0x1E6EFUL }, + { 0x1E6F5UL, 0x1E6F5UL }, { 0x1E8D0UL, 0x1E8D6UL }, { 0x1E944UL, 0x1E94AUL }, { 0xE0100UL, 0xE01EFUL } diff --git a/src/libre/class/utf8_N.c b/src/libre/class/utf8_N.c index a4b7fdebc..fa7dcc037 100644 --- a/src/libre/class/utf8_N.c +++ b/src/libre/class/utf8_N.c @@ -119,6 +119,7 @@ static const struct range ranges[] = { { 0x11C50UL, 0x11C6CUL }, { 0x11D50UL, 0x11D59UL }, { 0x11DA0UL, 0x11DA9UL }, + { 0x11DE0UL, 0x11DE9UL }, { 0x11F50UL, 0x11F59UL }, { 0x11FC0UL, 0x11FD4UL }, { 0x12400UL, 0x1246EUL }, @@ -129,6 +130,7 @@ static const struct range ranges[] = { { 0x16B5BUL, 0x16B61UL }, { 0x16D70UL, 0x16D79UL }, { 0x16E80UL, 0x16E96UL }, + { 0x16FF4UL, 0x16FF6UL }, { 0x1CCF0UL, 0x1CCF9UL }, { 0x1D2C0UL, 0x1D2D3UL }, { 0x1D2E0UL, 0x1D2F3UL }, diff --git a/src/libre/class/utf8_Nd.c b/src/libre/class/utf8_Nd.c index 7506f2d2e..19822a94b 100644 --- a/src/libre/class/utf8_Nd.c +++ 
b/src/libre/class/utf8_Nd.c @@ -60,6 +60,7 @@ static const struct range ranges[] = { { 0x11C50UL, 0x11C59UL }, { 0x11D50UL, 0x11D59UL }, { 0x11DA0UL, 0x11DA9UL }, + { 0x11DE0UL, 0x11DE9UL }, { 0x11F50UL, 0x11F59UL }, { 0x16130UL, 0x16139UL }, { 0x16A60UL, 0x16A69UL }, diff --git a/src/libre/class/utf8_Nl.c b/src/libre/class/utf8_Nl.c index fdf83eb7f..71097a550 100644 --- a/src/libre/class/utf8_Nl.c +++ b/src/libre/class/utf8_Nl.c @@ -14,7 +14,8 @@ static const struct range ranges[] = { { 0x10341UL, 0x10341UL }, { 0x1034AUL, 0x1034AUL }, { 0x103D1UL, 0x103D5UL }, - { 0x12400UL, 0x1246EUL } + { 0x12400UL, 0x1246EUL }, + { 0x16FF4UL, 0x16FF6UL } }; const struct class utf8_Nl = { diff --git a/src/libre/class/utf8_P.c b/src/libre/class/utf8_P.c index 9c506bd13..664a1d28c 100644 --- a/src/libre/class/utf8_P.c +++ b/src/libre/class/utf8_P.c @@ -151,6 +151,7 @@ static const struct range ranges[] = { { 0x10B99UL, 0x10B9CUL }, { 0x10D6EUL, 0x10D6EUL }, { 0x10EADUL, 0x10EADUL }, + { 0x10ED0UL, 0x10ED0UL }, { 0x10F55UL, 0x10F59UL }, { 0x10F86UL, 0x10F89UL }, { 0x11047UL, 0x1104DUL }, diff --git a/src/libre/class/utf8_Po.c b/src/libre/class/utf8_Po.c index ef95b8586..4b20947c0 100644 --- a/src/libre/class/utf8_Po.c +++ b/src/libre/class/utf8_Po.c @@ -146,6 +146,7 @@ static const struct range ranges[] = { { 0x10AF0UL, 0x10AF6UL }, { 0x10B39UL, 0x10B3FUL }, { 0x10B99UL, 0x10B9CUL }, + { 0x10ED0UL, 0x10ED0UL }, { 0x10F55UL, 0x10F59UL }, { 0x10F86UL, 0x10F89UL }, { 0x11047UL, 0x1104DUL }, diff --git a/src/libre/class/utf8_S.c b/src/libre/class/utf8_S.c index 0b69e43fe..9b66a152c 100644 --- a/src/libre/class/utf8_S.c +++ b/src/libre/class/utf8_S.c @@ -75,7 +75,7 @@ static const struct range ranges[] = { { 0x2052UL, 0x2052UL }, { 0x207AUL, 0x207CUL }, { 0x208AUL, 0x208CUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, { 0x2100UL, 0x2101UL }, { 0x2103UL, 0x2106UL }, { 0x2108UL, 0x2109UL }, @@ -103,8 +103,7 @@ static const struct range ranges[] = { { 0x2999UL, 0x29D7UL }, { 
0x29DCUL, 0x29FBUL }, { 0x29FEUL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2BFFUL }, + { 0x2B76UL, 0x2BFFUL }, { 0x2CE5UL, 0x2CEAUL }, { 0x2E50UL, 0x2E51UL }, { 0x2E80UL, 0x2E99UL }, @@ -138,9 +137,10 @@ static const struct range ranges[] = { { 0xAB5BUL, 0xAB5BUL }, { 0xAB6AUL, 0xAB6BUL }, { 0xFB29UL, 0xFB29UL }, - { 0xFBB2UL, 0xFBC2UL }, + { 0xFBB2UL, 0xFBD2UL }, { 0xFD40UL, 0xFD4FUL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFD90UL, 0xFD91UL }, + { 0xFDC8UL, 0xFDCFUL }, { 0xFDFCUL, 0xFDFFUL }, { 0xFE62UL, 0xFE62UL }, { 0xFE64UL, 0xFE66UL }, @@ -164,13 +164,17 @@ static const struct range ranges[] = { { 0x10877UL, 0x10878UL }, { 0x10AC8UL, 0x10AC8UL }, { 0x10D8EUL, 0x10D8FUL }, + { 0x10ED1UL, 0x10ED8UL }, { 0x1173FUL, 0x1173FUL }, { 0x11FD5UL, 0x11FF1UL }, { 0x16B3CUL, 0x16B3FUL }, { 0x16B45UL, 0x16B45UL }, { 0x1BC9CUL, 0x1BC9CUL }, { 0x1CC00UL, 0x1CCEFUL }, + { 0x1CCFAUL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEF0UL }, { 0x1CF50UL, 0x1CFC3UL }, { 0x1D000UL, 0x1D0F5UL }, { 0x1D100UL, 0x1D126UL }, @@ -215,11 +219,10 @@ static const struct range ranges[] = { { 0x1F240UL, 0x1F248UL }, { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, - { 0x1F300UL, 0x1F6D7UL }, + { 0x1F300UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -229,16 +232,19 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F8D0UL, 0x1F8D8UL }, + { 0x1F900UL, 0x1FA57UL }, { 0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 0x1FA80UL, 0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL 
}, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 0x1FBEFUL } + { 0x1FB94UL, 0x1FBEFUL }, + { 0x1FBFAUL, 0x1FBFAUL } }; const struct class utf8_S = { diff --git a/src/libre/class/utf8_Sc.c b/src/libre/class/utf8_Sc.c index 0effc2b86..3bba263b8 100644 --- a/src/libre/class/utf8_Sc.c +++ b/src/libre/class/utf8_Sc.c @@ -14,7 +14,7 @@ static const struct range ranges[] = { { 0x0BF9UL, 0x0BF9UL }, { 0x0E3FUL, 0x0E3FUL }, { 0x17DBUL, 0x17DBUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, { 0xA838UL, 0xA838UL }, { 0xFDFCUL, 0xFDFCUL }, { 0xFE69UL, 0xFE69UL }, diff --git a/src/libre/class/utf8_Sharada.c b/src/libre/class/utf8_Sharada.c index e51b2edff..dbfa21d3f 100644 --- a/src/libre/class/utf8_Sharada.c +++ b/src/libre/class/utf8_Sharada.c @@ -3,7 +3,8 @@ #include "class.h" static const struct range ranges[] = { - { 0x11180UL, 0x111DFUL } + { 0x11180UL, 0x111DFUL }, + { 0x11B60UL, 0x11B67UL } }; const struct class utf8_Sharada = { diff --git a/src/libre/class/utf8_Sidetic.c b/src/libre/class/utf8_Sidetic.c new file mode 100644 index 000000000..ed53b543d --- /dev/null +++ b/src/libre/class/utf8_Sidetic.c @@ -0,0 +1,13 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x10940UL, 0x10959UL } +}; + +const struct class utf8_Sidetic = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_Sm.c b/src/libre/class/utf8_Sm.c index 9c39b8003..e157e5f8c 100644 --- a/src/libre/class/utf8_Sm.c +++ b/src/libre/class/utf8_Sm.c @@ -57,6 +57,7 @@ static const struct range ranges[] = { { 0xFFE2UL, 0xFFE2UL }, { 0xFFE9UL, 0xFFECUL }, { 0x10D8EUL, 0x10D8FUL }, + { 0x1CEF0UL, 0x1CEF0UL }, { 0x1D6C1UL, 0x1D6C1UL }, { 0x1D6DBUL, 0x1D6DBUL }, { 0x1D6FBUL, 0x1D6FBUL }, @@ -67,7 +68,8 @@ static const struct range ranges[] = { { 0x1D789UL, 0x1D789UL }, { 0x1D7A9UL, 0x1D7A9UL }, { 0x1D7C3UL, 0x1D7C3UL }, - { 0x1EEF0UL, 0x1EEF1UL } + { 0x1EEF0UL, 0x1EEF1UL }, + { 0x1F8D0UL, 
0x1F8D8UL } }; const struct class utf8_Sm = { diff --git a/src/libre/class/utf8_So.c b/src/libre/class/utf8_So.c index d7e608ca4..93ca7e36b 100644 --- a/src/libre/class/utf8_So.c +++ b/src/libre/class/utf8_So.c @@ -82,8 +82,7 @@ static const struct range ranges[] = { { 0x2B00UL, 0x2B2FUL }, { 0x2B45UL, 0x2B46UL }, { 0x2B4DUL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2BFFUL }, + { 0x2B76UL, 0x2BFFUL }, { 0x2CE5UL, 0x2CEAUL }, { 0x2E50UL, 0x2E51UL }, { 0x2E80UL, 0x2E99UL }, @@ -111,8 +110,10 @@ static const struct range ranges[] = { { 0xA836UL, 0xA837UL }, { 0xA839UL, 0xA839UL }, { 0xAA77UL, 0xAA79UL }, + { 0xFBC3UL, 0xFBD2UL }, { 0xFD40UL, 0xFD4FUL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFD90UL, 0xFD91UL }, + { 0xFDC8UL, 0xFDCFUL }, { 0xFDFDUL, 0xFDFFUL }, { 0xFFE4UL, 0xFFE4UL }, { 0xFFE8UL, 0xFFE8UL }, @@ -126,6 +127,7 @@ static const struct range ranges[] = { { 0x101D0UL, 0x101FCUL }, { 0x10877UL, 0x10878UL }, { 0x10AC8UL, 0x10AC8UL }, + { 0x10ED1UL, 0x10ED8UL }, { 0x1173FUL, 0x1173FUL }, { 0x11FD5UL, 0x11FDCUL }, { 0x11FE1UL, 0x11FF1UL }, @@ -133,7 +135,10 @@ static const struct range ranges[] = { { 0x16B45UL, 0x16B45UL }, { 0x1BC9CUL, 0x1BC9CUL }, { 0x1CC00UL, 0x1CCEFUL }, + { 0x1CCFAUL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEEFUL }, { 0x1CF50UL, 0x1CFC3UL }, { 0x1D000UL, 0x1D0F5UL }, { 0x1D100UL, 0x1D126UL }, @@ -166,11 +171,10 @@ static const struct range ranges[] = { { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, { 0x1F300UL, 0x1F3FAUL }, - { 0x1F400UL, 0x1F6D7UL }, + { 0x1F400UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -180,16 +184,18 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F900UL, 0x1FA57UL }, { 
0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 0x1FA80UL, 0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL }, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 0x1FBEFUL } + { 0x1FB94UL, 0x1FBEFUL }, + { 0x1FBFAUL, 0x1FBFAUL } }; const struct class utf8_So = { diff --git a/src/libre/class/utf8_Tai_Yo.c b/src/libre/class/utf8_Tai_Yo.c new file mode 100644 index 000000000..643d73fe2 --- /dev/null +++ b/src/libre/class/utf8_Tai_Yo.c @@ -0,0 +1,15 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6F5UL }, + { 0x1E6FEUL, 0x1E6FFUL } +}; + +const struct class utf8_Tai_Yo = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_Tangut.c b/src/libre/class/utf8_Tangut.c index cbd82c907..6bf5e5687 100644 --- a/src/libre/class/utf8_Tangut.c +++ b/src/libre/class/utf8_Tangut.c @@ -4,9 +4,9 @@ static const struct range ranges[] = { { 0x16FE0UL, 0x16FE0UL }, - { 0x17000UL, 0x187F7UL }, - { 0x18800UL, 0x18AFFUL }, - { 0x18D00UL, 0x18D08UL } + { 0x17000UL, 0x18AFFUL }, + { 0x18D00UL, 0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL } }; const struct class utf8_Tangut = { diff --git a/src/libre/class/utf8_Telugu.c b/src/libre/class/utf8_Telugu.c index a2c65ae49..1b80f5e1f 100644 --- a/src/libre/class/utf8_Telugu.c +++ b/src/libre/class/utf8_Telugu.c @@ -12,7 +12,7 @@ static const struct range ranges[] = { { 0x0C4AUL, 0x0C4DUL }, { 0x0C55UL, 0x0C56UL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C63UL }, { 0x0C66UL, 0x0C6FUL }, { 0x0C77UL, 0x0C7FUL } diff --git a/src/libre/class/utf8_Tolong_Siki.c b/src/libre/class/utf8_Tolong_Siki.c new file mode 100644 index 000000000..e670a957b --- /dev/null +++ 
b/src/libre/class/utf8_Tolong_Siki.c @@ -0,0 +1,14 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x11DB0UL, 0x11DDBUL }, + { 0x11DE0UL, 0x11DE9UL } +}; + +const struct class utf8_Tolong_Siki = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_assigned.c b/src/libre/class/utf8_assigned.c index d38345570..32b689a26 100644 --- a/src/libre/class/utf8_assigned.c +++ b/src/libre/class/utf8_assigned.c @@ -24,8 +24,7 @@ static const struct range ranges[] = { { 0x0840UL, 0x085BUL }, { 0x085EUL, 0x085EUL }, { 0x0860UL, 0x086AUL }, - { 0x0870UL, 0x088EUL }, - { 0x0890UL, 0x0891UL }, + { 0x0870UL, 0x0891UL }, { 0x0897UL, 0x0983UL }, { 0x0985UL, 0x098CUL }, { 0x098FUL, 0x0990UL }, @@ -109,7 +108,7 @@ static const struct range ranges[] = { { 0x0C4AUL, 0x0C4DUL }, { 0x0C55UL, 0x0C56UL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C63UL }, { 0x0C66UL, 0x0C6FUL }, { 0x0C77UL, 0x0C8CUL }, @@ -121,7 +120,7 @@ static const struct range ranges[] = { { 0x0CC6UL, 0x0CC8UL }, { 0x0CCAUL, 0x0CCDUL }, { 0x0CD5UL, 0x0CD6UL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE3UL }, { 0x0CE6UL, 0x0CEFUL }, { 0x0CF1UL, 0x0CF3UL }, @@ -216,7 +215,8 @@ static const struct range ranges[] = { { 0x1A7FUL, 0x1A89UL }, { 0x1A90UL, 0x1A99UL }, { 0x1AA0UL, 0x1AADUL }, - { 0x1AB0UL, 0x1ACEUL }, + { 0x1AB0UL, 0x1ADDUL }, + { 0x1AE0UL, 0x1AEBUL }, { 0x1B00UL, 0x1B4CUL }, { 0x1B4EUL, 0x1BF3UL }, { 0x1BFCUL, 0x1C37UL }, @@ -245,14 +245,13 @@ static const struct range ranges[] = { { 0x2066UL, 0x2071UL }, { 0x2074UL, 0x208EUL }, { 0x2090UL, 0x209CUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, { 0x20D0UL, 0x20F0UL }, { 0x2100UL, 0x218BUL }, { 0x2190UL, 0x2429UL }, { 0x2440UL, 0x244AUL }, { 0x2460UL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2CF3UL }, + { 0x2B76UL, 0x2CF3UL }, { 0x2CF9UL, 0x2D25UL }, { 0x2D27UL, 0x2D27UL }, { 0x2D2DUL, 0x2D2DUL }, 
@@ -284,11 +283,8 @@ static const struct range ranges[] = { { 0xA490UL, 0xA4C6UL }, { 0xA4D0UL, 0xA62BUL }, { 0xA640UL, 0xA6F7UL }, - { 0xA700UL, 0xA7CDUL }, - { 0xA7D0UL, 0xA7D1UL }, - { 0xA7D3UL, 0xA7D3UL }, - { 0xA7D5UL, 0xA7DCUL }, - { 0xA7F2UL, 0xA82CUL }, + { 0xA700UL, 0xA7DCUL }, + { 0xA7F1UL, 0xA82CUL }, { 0xA830UL, 0xA839UL }, { 0xA840UL, 0xA877UL }, { 0xA880UL, 0xA8C5UL }, @@ -328,10 +324,7 @@ static const struct range ranges[] = { { 0xFB3EUL, 0xFB3EUL }, { 0xFB40UL, 0xFB41UL }, { 0xFB43UL, 0xFB44UL }, - { 0xFB46UL, 0xFBC2UL }, - { 0xFBD3UL, 0xFD8FUL }, - { 0xFD92UL, 0xFDC7UL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFB46UL, 0xFDCFUL }, { 0xFDF0UL, 0xFE19UL }, { 0xFE20UL, 0xFE52UL }, { 0xFE54UL, 0xFE66UL }, @@ -402,7 +395,7 @@ static const struct range ranges[] = { { 0x108F4UL, 0x108F5UL }, { 0x108FBUL, 0x1091BUL }, { 0x1091FUL, 0x10939UL }, - { 0x1093FUL, 0x1093FUL }, + { 0x1093FUL, 0x10959UL }, { 0x10980UL, 0x109B7UL }, { 0x109BCUL, 0x109CFUL }, { 0x109D2UL, 0x10A03UL }, @@ -434,8 +427,9 @@ static const struct range ranges[] = { { 0x10E80UL, 0x10EA9UL }, { 0x10EABUL, 0x10EADUL }, { 0x10EB0UL, 0x10EB1UL }, - { 0x10EC2UL, 0x10EC4UL }, - { 0x10EFCUL, 0x10F27UL }, + { 0x10EC2UL, 0x10EC7UL }, + { 0x10ED0UL, 0x10ED8UL }, + { 0x10EFAUL, 0x10F27UL }, { 0x10F30UL, 0x10F59UL }, { 0x10F70UL, 0x10F89UL }, { 0x10FB0UL, 0x10FCBUL }, @@ -518,6 +512,7 @@ static const struct range ranges[] = { { 0x11A50UL, 0x11AA2UL }, { 0x11AB0UL, 0x11AF8UL }, { 0x11B00UL, 0x11B09UL }, + { 0x11B60UL, 0x11B67UL }, { 0x11BC0UL, 0x11BE1UL }, { 0x11BF0UL, 0x11BF9UL }, { 0x11C00UL, 0x11C08UL }, @@ -540,6 +535,8 @@ static const struct range ranges[] = { { 0x11D90UL, 0x11D91UL }, { 0x11D93UL, 0x11D98UL }, { 0x11DA0UL, 0x11DA9UL }, + { 0x11DB0UL, 0x11DDBUL }, + { 0x11DE0UL, 0x11DE9UL }, { 0x11EE0UL, 0x11EF8UL }, { 0x11F00UL, 0x11F10UL }, { 0x11F12UL, 0x11F3AUL }, @@ -569,16 +566,18 @@ static const struct range ranges[] = { { 0x16B7DUL, 0x16B8FUL }, { 0x16D40UL, 0x16D79UL }, { 0x16E40UL, 0x16E9AUL }, 
+ { 0x16EA0UL, 0x16EB8UL }, + { 0x16EBBUL, 0x16ED3UL }, { 0x16F00UL, 0x16F4AUL }, { 0x16F4FUL, 0x16F87UL }, { 0x16F8FUL, 0x16F9FUL }, { 0x16FE0UL, 0x16FE4UL }, - { 0x16FF0UL, 0x16FF1UL }, + { 0x16FF0UL, 0x16FF6UL }, { 0x17000UL, 0x17000UL }, - { 0x187F7UL, 0x187F7UL }, - { 0x18800UL, 0x18CD5UL }, + { 0x187FFUL, 0x18CD5UL }, { 0x18CFFUL, 0x18D00UL }, - { 0x18D08UL, 0x18D08UL }, + { 0x18D1EUL, 0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL }, { 0x1AFF0UL, 0x1AFF3UL }, { 0x1AFF5UL, 0x1AFFBUL }, { 0x1AFFDUL, 0x1AFFEUL }, @@ -593,8 +592,10 @@ static const struct range ranges[] = { { 0x1BC80UL, 0x1BC88UL }, { 0x1BC90UL, 0x1BC99UL }, { 0x1BC9CUL, 0x1BCA3UL }, - { 0x1CC00UL, 0x1CCF9UL }, + { 0x1CC00UL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEF0UL }, { 0x1CF00UL, 0x1CF2DUL }, { 0x1CF30UL, 0x1CF46UL }, { 0x1CF50UL, 0x1CFC3UL }, @@ -648,6 +649,9 @@ static const struct range ranges[] = { { 0x1E4D0UL, 0x1E4F9UL }, { 0x1E5D0UL, 0x1E5FAUL }, { 0x1E5FFUL, 0x1E5FFUL }, + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6F5UL }, + { 0x1E6FEUL, 0x1E6FFUL }, { 0x1E7E0UL, 0x1E7E6UL }, { 0x1E7E8UL, 0x1E7EBUL }, { 0x1E7EDUL, 0x1E7EEUL }, @@ -705,11 +709,10 @@ static const struct range ranges[] = { { 0x1F240UL, 0x1F248UL }, { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, - { 0x1F300UL, 0x1F6D7UL }, + { 0x1F300UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -719,24 +722,25 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F8D0UL, 0x1F8D8UL }, + { 0x1F900UL, 0x1FA57UL }, { 0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 
0x1FA80UL, 0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL }, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 0x1FBF9UL }, + { 0x1FB94UL, 0x1FBFAUL }, { 0x20000UL, 0x20000UL }, { 0x2A6DFUL, 0x2A6DFUL }, { 0x2A700UL, 0x2A700UL }, - { 0x2B739UL, 0x2B739UL }, - { 0x2B740UL, 0x2B740UL }, + { 0x2B73FUL, 0x2B740UL }, { 0x2B81DUL, 0x2B81DUL }, { 0x2B820UL, 0x2B820UL }, - { 0x2CEA1UL, 0x2CEA1UL }, + { 0x2CEADUL, 0x2CEADUL }, { 0x2CEB0UL, 0x2CEB0UL }, { 0x2EBE0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EBF0UL }, @@ -745,7 +749,8 @@ static const struct range ranges[] = { { 0x30000UL, 0x30000UL }, { 0x3134AUL, 0x3134AUL }, { 0x31350UL, 0x31350UL }, - { 0x323AFUL, 0x323AFUL }, + { 0x323AFUL, 0x323B0UL }, + { 0x33479UL, 0x33479UL }, { 0xE0001UL, 0xE0001UL }, { 0xE0020UL, 0xE007FUL }, { 0xE0100UL, 0xE01EFUL }, diff --git a/src/libre/class_name.c b/src/libre/class_name.c index d96a1d610..e4cb7cc0f 100644 --- a/src/libre/class_name.c +++ b/src/libre/class_name.c @@ -50,6 +50,7 @@ static struct { { &utf8_Bassa_Vah, "Bassa Vah" }, { &utf8_Batak, "Batak" }, { &utf8_Bengali, "Bengali" }, + { &utf8_Beria_Erfe, "Beria Erfe" }, { &utf8_Bhaiksuki, "Bhaiksuki" }, { &utf8_Bopomofo, "Bopomofo" }, { &utf8_Brahmi, "Brahmi" }, @@ -174,6 +175,7 @@ static struct { { &utf8_Sharada, "Sharada" }, { &utf8_Shavian, "Shavian" }, { &utf8_Siddham, "Siddham" }, + { &utf8_Sidetic, "Sidetic" }, { &utf8_SignWriting, "SignWriting" }, { &utf8_Sinhala, "Sinhala" }, { &utf8_Sogdian, "Sogdian" }, @@ -188,6 +190,7 @@ static struct { { &utf8_Tai_Le, "Tai Le" }, { &utf8_Tai_Tham, "Tai Tham" }, { &utf8_Tai_Viet, "Tai Viet" }, + { &utf8_Tai_Yo, "Tai Yo" }, { &utf8_Takri, "Takri" }, { &utf8_Tamil, "Tamil" }, { &utf8_Tangsa, "Tangsa" }, @@ -199,6 +202,7 @@ static struct { { &utf8_Tifinagh, "Tifinagh" }, { &utf8_Tirhuta, "Tirhuta" }, { &utf8_Todhri, "Todhri" }, + { &utf8_Tolong_Siki, "Tolong Siki" }, { &utf8_Toto, "Toto" }, { 
&utf8_Tulu_Tigalari, "Tulu Tigalari" }, { &utf8_Ugaritic, "Ugaritic" }, From 4155d56a3cf65ffd026aa6d22cae510147652001 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 27 Aug 2024 14:08:39 -0400 Subject: [PATCH 30/80] experimental: Add eager outputs, similar to endids but eagerly matched. When combining several unanchored regexes it becomes VERY expensive to handle combinations of matches via the end state -- essentially, the whole reachable DFA gets separate matching and non-matching copies for each pattern, leading to a DFA whose size is proportional to the number of *possible combinations* of matches. With eager outputs, we can set a flag for matching as we reach the end of the original pattern (before looping back and possibly also matching other patterns), which keeps the state count from blowing up in fsm_determinise. To see how much difference this makes, the test tests/eager_output/run7 combines 26 different patterns. It should finish very quickly (~50 msec, just now). Try running it with `env FORCE_ENDIDS=N` for N increasing from 4 to 26. Around 10-11 it will start taking several seconds, and memory usage will roughly double with each step. This PR adds `fsm_union_repeated_pattern_group`, a variant of `fsm_union_array` that combines a set of DFAs into a single NFA, but correctly handles a mix of anchored and unanchored ends without the state count blowing up. It currently needs flags passed in for each fsm indicating whether the start and/or end are anchored, and there is a hacky special case that removes self-edges from states with eager outputs and instead connects them to a single overall unanchored end loop. I haven't yet figured out how to handle this properly in the general case, but it works for this specific use case, provided all the DFAs are combined at once. (Combining multiple DFAs each produced by determinising fsm_union_repeated_pattern_group's result probably won't work correctly.) 
I have tried detecting and ignoring those edges in fsm_determinise, after epsilon removal, but so far either it still causes the graph size to blow up or subtly breaks something else. This is still experimental, and the code generation for `-lc` here is quite hacky -- it expects the caller to define a `FSM_SET_EAGER_OUTPUT` macro, since the code generation interface doesn't define where the match info will go yet. A later PR will add a new code generation mode with better support for eager outputs, and I plan to eventually integrate this better with rx, AMBIG_MULTIPLE, and so on. (This squashes down a couple false starts.) --- Makefile | 1 + fuzz/target.c | 524 +++++++++++++++++- include/fsm/bool.h | 10 + include/fsm/fsm.h | 46 ++ include/fsm/print.h | 3 + src/libfsm/Makefile | 1 + src/libfsm/clone.c | 38 ++ src/libfsm/consolidate.c | 46 ++ src/libfsm/determinise.c | 131 ++++- src/libfsm/determinise_internal.h | 10 +- src/libfsm/eager_output.c | 403 ++++++++++++++ src/libfsm/eager_output.h | 46 ++ src/libfsm/epsilons.c | 147 ++++- src/libfsm/exec.c | 55 +- src/libfsm/fsm.c | 11 + src/libfsm/internal.h | 5 + src/libfsm/libfsm.syms | 11 + src/libfsm/merge.c | 42 ++ src/libfsm/minimise.c | 79 ++- src/libfsm/print/c.c | 13 + src/libfsm/print/ir.c | 35 ++ src/libfsm/print/ir.h | 5 + src/libfsm/state.c | 8 + src/libfsm/union.c | 233 ++++++++ tests/eager_output/Makefile | 22 + tests/eager_output/eager_output1.c | 12 + tests/eager_output/eager_output2.c | 17 + tests/eager_output/eager_output3.c | 16 + tests/eager_output/eager_output4.c | 13 + tests/eager_output/eager_output5.c | 14 + tests/eager_output/eager_output6.c | 34 ++ tests/eager_output/eager_output7.c | 103 ++++ tests/eager_output/eager_output_at_start.c | 12 + tests/eager_output/eager_output_fr1.c | 13 + tests/eager_output/eager_output_fr2.c | 13 + tests/eager_output/eager_output_fr3.c | 13 + .../eager_output_mixed_anchored_unanchored.c | 46 ++ tests/eager_output/utils.c | 278 ++++++++++ 
tests/eager_output/utils.h | 64 +++ 39 files changed, 2537 insertions(+), 36 deletions(-) create mode 100644 src/libfsm/eager_output.c create mode 100644 src/libfsm/eager_output.h create mode 100644 tests/eager_output/Makefile create mode 100644 tests/eager_output/eager_output1.c create mode 100644 tests/eager_output/eager_output2.c create mode 100644 tests/eager_output/eager_output3.c create mode 100644 tests/eager_output/eager_output4.c create mode 100644 tests/eager_output/eager_output5.c create mode 100644 tests/eager_output/eager_output6.c create mode 100644 tests/eager_output/eager_output7.c create mode 100644 tests/eager_output/eager_output_at_start.c create mode 100644 tests/eager_output/eager_output_fr1.c create mode 100644 tests/eager_output/eager_output_fr2.c create mode 100644 tests/eager_output/eager_output_fr3.c create mode 100644 tests/eager_output/eager_output_mixed_anchored_unanchored.c create mode 100644 tests/eager_output/utils.c create mode 100644 tests/eager_output/utils.h diff --git a/Makefile b/Makefile index b9e196d7b..ad16b1ec7 100644 --- a/Makefile +++ b/Makefile @@ -118,6 +118,7 @@ SUBDIR += tests/equals SUBDIR += tests/subtract SUBDIR += tests/detect_required SUBDIR += tests/determinise +SUBDIR += tests/eager_output SUBDIR += tests/endids SUBDIR += tests/epsilons SUBDIR += tests/fsm diff --git a/fuzz/target.c b/fuzz/target.c index 543891bb9..d56a9bf82 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -26,10 +26,21 @@ /* 10 seconds */ #define TIMEOUT_USEC (10ULL * 1000 * 1000) +static bool verbosity_checked = false; +static bool verbose = false; + +#define LOG(...) 
\ + do { \ + if (verbose) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) \ + enum run_mode { MODE_DEFAULT, MODE_SHUFFLE_MINIMISE, MODE_ALL_PRINT_FUNCTIONS, + MODE_EAGER_OUTPUT, }; @@ -344,6 +355,508 @@ fuzz_all_print_functions(FILE *f, const char *pattern, bool det, bool min, const return EXIT_SUCCESS; } +#define MAX_PATTERNS 4 /* capacity of the fixed-size per-pattern arrays below */ +struct eager_output_cb_info { + size_t used; /* number of valid entries at the front of ids[] */ + fsm_output_id_t ids[MAX_PATTERNS]; /* distinct output ids, in order of first report */ +}; + +static void +reset_eager_output_info(struct eager_output_cb_info *info) +{ + info->used = 0; /* empties the set; ids[] contents are left untouched */ +} + +struct feo_env { + bool ok; + size_t pattern_count; /* entries filled in patterns[] */ + size_t fsm_count; /* entries filled in fsms[] */ + size_t max_match_count; + size_t max_steps; + + char *patterns[MAX_PATTERNS]; + struct fsm *fsms[MAX_PATTERNS]; + struct fsm *combined; + + /* which pattern is being used for generation, (size_t)-1 for combined */ + size_t current_pattern; + + struct eager_output_cb_info outputs; + struct eager_output_cb_info outputs_combined; +}; + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque) /* opaque must point to a struct eager_output_cb_info */ +{ + struct eager_output_cb_info *info = (struct eager_output_cb_info *)opaque; + + for (size_t i = 0; i < info->used; i++) { /* linear scan keeps ids[] duplicate-free */ + if (info->ids[i] == id) { + return; /* already present */ + } + } + + assert(info->used < MAX_PATTERNS); /* a new id must still fit in ids[] */ + info->ids[info->used++] = id; +} + +static enum fsm_generate_matches_cb_res +gen_combined_check_individual_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +static enum fsm_generate_matches_cb_res +gen_individual_check_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque); + +#define DEF_MAX_STEPS 100000 +#define DEF_MAX_MATCH_COUNT 1000 + +/* This isn't part of the public interface, per se.
*/ +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +static int +fuzz_eager_output(const uint8_t *data, size_t size) +{ + struct feo_env env = { + .ok = true, + .pattern_count = 0, + .max_steps = DEF_MAX_STEPS, + .max_match_count = DEF_MAX_MATCH_COUNT, + }; + + { + const char *steps = getenv("STEPS"); + const char *matches = getenv("MATCHES"); + if (steps != NULL) { + env.max_steps = strtoul(steps, NULL, 10); + assert(env.max_steps > 0); + } + if (matches != NULL) { + env.max_match_count = strtoul(matches, NULL, 10); + assert(env.max_match_count > 0); + } + } + + int ret = 0; + + size_t max_pattern_length = 0; + + /* chop data into a series of patterns */ + { + size_t prev = 0; + size_t offset = 0; + + /* Patterns with lots of '.' can take a while to determinise. + * That slows down fuzzer coverage, but isn't interesting here. */ + size_t dots = 0; + + while (offset < size && env.pattern_count < MAX_PATTERNS) { +#define MAX_DOTS 4 + if (data[offset] == '.') { dots++; } + + if (data[offset] == '\0' || data[offset] == '\n' || offset == size - 1) { + size_t len = offset - prev; + + if (dots > MAX_DOTS) { + /* ignored */ + prev = offset; + } else if (len > 0) { + char *pattern = malloc(len + 1); + assert(pattern != NULL); + + memcpy(pattern, &data[prev], len); + if (len > 0 && pattern[len] == '\n') { + len--; /* drop trailing newline */ + } + pattern[len] = '\0'; + bool keep = true; + + if (len > 0) { + for (size_t i = 0; i < len - 1; i++) { + if (pattern[i] == '\\' && pattern[i + 1] == 'x') { + /* ignore unhandled parser errors from "\x", see #386 */ + keep = false; + } + } + } + + if (keep) { + env.patterns[env.pattern_count++] = pattern; + + if (len > max_pattern_length) { + max_pattern_length = len; + } + } else { + free(pattern); + } + prev = offset; + dots = 0; + } + } + + offset++; + } + } + + struct re_anchoring_info anchorage[MAX_PATTERNS] = {0}; + + /* for each pattern, attempt to compile to a DFA */ + for (size_t p_i = 0; p_i < 
env.pattern_count; p_i++) { + const char *p = env.patterns[p_i]; + + if (!re_is_anchored(RE_PCRE, fsm_sgetc, &p, 0, NULL, &anchorage[p_i])) { + continue; /* unsupported regex */ + } + + p = env.patterns[p_i]; + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + + LOG("%s: pattern %zd: '%s' => %p\n", __func__, p_i, env.patterns[p_i], (void *)fsm); + + if (fsm == NULL) { + continue; /* invalid regex */ + } + + const fsm_output_id_t endid = (fsm_output_id_t)p_i; + ret = fsm_seteageroutputonends(fsm, endid); + assert(ret == 1); + + if (verbose) { + fprintf(stderr, "==== pattern %zd, pre det\n", p_i); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(fsm); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? %d\n", i, fsm_isend(fsm, i)); + } + } + + ret = fsm_determinise(fsm); + assert(ret == 1); + + ret = fsm_minimise(fsm); + assert(ret == 1); + + fsm_state_t start; + if (!fsm_getstart(fsm, &start)) { + fsm_free(fsm); + continue; + } + + if (verbose) { + fprintf(stderr, "==== pattern %zd, post det\n", p_i); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(fsm); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? 
%d\n", i, fsm_isend(fsm, i)); + } + } + + fsm_eager_output_set_cb(fsm, append_eager_output_cb, &env.outputs); + env.fsms[env.fsm_count++] = fsm; + } + + /* don't bother checking combined behavior unless there's multiple DFAs */ + if (env.fsm_count < 2) { goto cleanup; } + + /* copy and combine fsms into one DFA */ + { + size_t used = 0; + struct fsm_union_entry entries[MAX_PATTERNS] = {0}; + + for (size_t i = 0; i < env.fsm_count; i++) { + /* there can be gaps, fsms[] lines up with patterns[] */ + if (env.fsms[i] == NULL) { continue; } + + fsm_state_t start; + if (!fsm_getstart(env.fsms[i], &start)) { + assert(!"hit"); + } + + struct fsm *cp = fsm_clone(env.fsms[i]); + assert(cp != NULL); + + if (verbose) { + fprintf(stderr, "==== cp %zd\n", i); + fsm_dump(stderr, cp); + fsm_eager_output_dump(stderr, cp); + fprintf(stderr, "====\n"); + + fsm_state_t c = fsm_countstates(cp); + for (fsm_state_t i = 0; i < c; i++) { + fprintf(stderr, "-- %d: end? %d\n", i, fsm_isend(cp, i)); + } + } + + entries[used].fsm = cp; + entries[used].anchored_start = anchorage[i].start; + entries[used].anchored_end = anchorage[i].end; + used++; + } + + if (used == 0) { + goto cleanup; /* nothing to do */ + } + + /* consumes entries[] */ + struct fsm *fsm = fsm_union_repeated_pattern_group(used, entries, NULL); + assert(fsm != NULL); + + if (verbose) { + fprintf(stderr, "==== combined (pre-det)\n"); + fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + if (!fsm_determinise(fsm)) { + assert(!"failed to determinise"); + } + + if (!fsm_minimise(fsm)) { + assert(!"failed to minimise"); + } + + LOG("%s: combined state_count %d\n", __func__, fsm_countstates(fsm)); + env.combined = fsm; + /* fsm_eager_output_set_cb(fsm, append_eager_output_cb, &env.outputs_combined); */ + + if (verbose) { + fprintf(stderr, "==== combined\n"); + fsm_dump(stderr, env.combined); + fsm_eager_output_dump(stderr, env.combined); + fprintf(stderr, "====\n"); + } + + } + + /* Use 
fsm_generate_matches to check for matches that got lost + * and false positives introduced while combining the DFAs. + * Use the combined DFA to generate matches, check that the + * match behavior agrees with the individual DFA copies. */ + env.current_pattern = (size_t)-1; + if (!fsm_generate_matches(env.combined, max_pattern_length, gen_combined_check_individual_cb, &env)) { + goto cleanup; + } + + if (!env.ok) { goto cleanup; } + + /* Likewise, use every individual DFA to generate matches and */ + /* check behavior against the combined DFA. fsms[] is packed, so walk fsm_count. */ + for (size_t i = 0; i < env.fsm_count; i++) { + env.current_pattern = i; + if (!fsm_generate_matches(env.fsms[i], max_pattern_length, gen_individual_check_combined_cb, &env)) { + goto cleanup; + } + } + + ret = env.ok ? EXIT_SUCCESS : EXIT_FAILURE; +cleanup: + for (size_t i = 0; i < MAX_PATTERNS; i++) { + if (env.patterns[i] != NULL) { + free(env.patterns[i]); + env.patterns[i] = NULL; + } + if (env.fsms[i] != NULL) { + fsm_free(env.fsms[i]); + } + } + if (env.combined != NULL) { + fsm_free(env.combined); + } + + return ret; +} + +static int +cmp_output_id(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 
1 : 0; +} + +static bool +match_input_get_eager_outputs(struct fsm *fsm, const char *input, size_t input_length, + struct eager_output_cb_info *dst) +{ + (void)input_length; + fsm_state_t end; + + reset_eager_output_info(dst); + + fsm_eager_output_set_cb(fsm, append_eager_output_cb, dst); + const int ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + if (ret == 0) { + return false; /* no match */ + } else { + assert(ret == 1); /* match */ + } + + /* sort the IDs, to make comparison cheaper */ + qsort(dst->ids, dst->used, sizeof(dst->ids[0]), cmp_output_id); + return true; /* match */ +} + +/* For a given matching input generated by the combined DFA, check that + * only the expected individual source DFAs match. */ +static enum fsm_generate_matches_cb_res +gen_combined_check_individual_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + (void)fsm; + (void)depth; + (void)end_state; + + struct feo_env *env = opaque; + assert(env->current_pattern == (size_t)-1); + + if (match_count > env->max_match_count) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + if (steps > env->max_steps) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + + /* execute, to set eager outputs */ + if (!match_input_get_eager_outputs(env->combined, input, input_length, &env->outputs_combined)) { + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + size_t individual_outputs_used = 0; + fsm_output_id_t individual_outputs[MAX_PATTERNS]; + + for (size_t i = 0; i < env->pattern_count; i++) { + struct fsm *fsm = env->fsms[i]; + if (fsm == NULL) { continue; } + + if (!match_input_get_eager_outputs(fsm, input, input_length, &env->outputs)) { + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; + } + + if (env->outputs.used > 0) { + assert(env->outputs.used == 1); + individual_outputs[individual_outputs_used++] = env->outputs.ids[0]; + } + } + + bool match = true; + if 
(env->outputs_combined.used != individual_outputs_used) { + match = false; + } + + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + const fsm_output_id_t cur = env->outputs_combined.ids[cmb_i]; + assert(env->fsms[cmb_i] != NULL); + bool found = false; + for (size_t i = 0; i < individual_outputs_used; i++) { + if (individual_outputs[i] == cur) { + found = true; + break; + } + } + if (!found) { + match = false; + break; + } + } + + if (!match) { + fprintf(stderr, "%s: combined <-> individual mismatch for input '%s'(%zd)!\n", __func__, input, input_length); + + fprintf(stderr, "-- combined: %zu IDs:", env->outputs_combined.used); + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + fprintf(stderr, " %d", env->outputs_combined.ids[cmb_i]); + } + fprintf(stderr, "\n"); + fprintf(stderr, "-- individiual: %zu IDs:", individual_outputs_used); + for (size_t i = 0; i < individual_outputs_used; i++) { + fprintf(stderr, " %d", individual_outputs[i]); + } + fprintf(stderr, "\n"); + goto fail; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; + +fail: + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; +} + +/* For a given matching input generated by one of the source DFAs, check that + * the combined DFA also matches, and that the only other source DFAs that match + * are ones that should according to the combined DFA. 
*/ +static enum fsm_generate_matches_cb_res +gen_individual_check_combined_cb(const struct fsm *fsm, + size_t depth, size_t match_count, size_t steps, + const char *input, size_t input_length, + fsm_state_t end_state, void *opaque) +{ + (void)fsm; + (void)depth; + (void)end_state; + + struct feo_env *env = opaque; + assert(env->current_pattern < env->pattern_count); + if (match_count > env->max_match_count) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + if (steps > env->max_steps) { return FSM_GENERATE_MATCHES_CB_RES_HALT; } + + struct fsm *cur_fsm = env->fsms[env->current_pattern]; + if (cur_fsm == NULL) { return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; } + + /* execute, to set eager outputs */ + if (!match_input_get_eager_outputs(cur_fsm, input, input_length, &env->outputs)) { + goto fail; + } + if (!match_input_get_eager_outputs(env->combined, input, input_length, &env->outputs_combined)) { + goto fail; + } + + assert(env->outputs.used == 1); + + bool found = false; + for (size_t i = 0; i < env->outputs_combined.used; i++) { + if (env->outputs_combined.ids[i] == env->outputs.ids[0]) { + found = true; + break; + } + } + + if (!found) { + fprintf(stderr, "%s: combined <-> individual mismatch for input '%s'(%zd)!\n", __func__, input, input_length); + + fprintf(stderr, "-- combined: %zu IDs:", env->outputs_combined.used); + for (size_t cmb_i = 0; cmb_i < env->outputs_combined.used; cmb_i++) { + fprintf(stderr, " %d", env->outputs_combined.ids[cmb_i]); + } + fprintf(stderr, "\n"); + fprintf(stderr, "-- pattern %zd: %zu IDs:", env->current_pattern, env->outputs.used); + for (size_t i = 0; i < env->outputs.used; i++) { + fprintf(stderr, " %d", env->outputs.ids[i]); + } + fprintf(stderr, "\n"); + goto fail; + } + + return FSM_GENERATE_MATCHES_CB_RES_CONTINUE; + +fail: + env->ok = false; + return FSM_GENERATE_MATCHES_CB_RES_HALT; +} +#undef MAX_PATTERNS + #define MAX_FUZZER_DATA (64 * 1024) static uint8_t data_buf[MAX_FUZZER_DATA + 1]; @@ -358,6 +871,7 @@ get_run_mode(void) 
switch (mode[0]) { case 'm': return MODE_SHUFFLE_MINIMISE; case 'p': return MODE_ALL_PRINT_FUNCTIONS; + case 'E': return MODE_EAGER_OUTPUT; case 'd': default: return MODE_DEFAULT; @@ -373,6 +887,11 @@ harness_fuzzer_target(const uint8_t *data, size_t size) return EXIT_SUCCESS; } + if (!verbosity_checked) { + verbosity_checked = true; + verbose = getenv("VERBOSE") != NULL; + } + /* Ensure that input is '\0'-terminated. */ if (size > MAX_FUZZER_DATA) { size = MAX_FUZZER_DATA; @@ -392,6 +911,9 @@ harness_fuzzer_target(const uint8_t *data, size_t size) case MODE_SHUFFLE_MINIMISE: return shuffle_minimise(pattern); + case MODE_EAGER_OUTPUT: + return fuzz_eager_output(data, size); + case MODE_ALL_PRINT_FUNCTIONS: { if (dev_null == NULL) { @@ -403,7 +925,7 @@ harness_fuzzer_target(const uint8_t *data, size_t size) const bool det = b0 & 0x1; const bool min = b0 & 0x2; const enum fsm_io io_mode = (b0 >> 2) % 3; - + const char *shifted_pattern = (const char *)&data_buf[1]; int res = fuzz_all_print_functions(dev_null, shifted_pattern, det, min, io_mode); return res; diff --git a/include/fsm/bool.h b/include/fsm/bool.h index d92518297..4d9f1889a 100644 --- a/include/fsm/bool.h +++ b/include/fsm/bool.h @@ -52,6 +52,16 @@ struct fsm * fsm_union_array(size_t fsm_count, struct fsm **fsms, struct fsm_combined_base_pair *bases); +struct fsm_union_entry { + struct fsm *fsm; + bool anchored_start; + bool anchored_end; +}; + +struct fsm * +fsm_union_repeated_pattern_group(size_t entry_count, + struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases); + struct fsm * fsm_intersect(struct fsm *a, struct fsm *b); diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index 1dd710d0e..b862cb041 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -7,6 +7,7 @@ #ifndef FSM_H #define FSM_H +#include #include struct fsm; @@ -27,6 +28,9 @@ typedef unsigned int fsm_state_t; * original FSM(s) matched when executing a combined FSM. 
*/ typedef unsigned int fsm_end_id_t; +/* Eager output ID. */ +typedef unsigned int fsm_output_id_t; + #define FSM_END_ID_MAX UINT_MAX /* @@ -266,6 +270,39 @@ fsm_mapendids(struct fsm * fsm, fsm_endid_remap_fun remap, void *opaque); void fsm_increndids(struct fsm * fsm, int delta); +/* Associate an eagerly matched numeric ID with the end states in an fsm. + * + * This is similar to fsm_setendid, but has different performance + * trade-offs. In particular, it can become extremely expensive to + * combine multiple DFAs with endids on their end states when they + * represent regexes with unanchored ends, because the FSM has to + * explicitly represent all the possible combinations of matches by + * copying the entire path to every reachable end state. Eager endids + * are associated with the edge leaving the main pattern match. + * + * Returns 1 on success, 0 on error. + * */ +int +fsm_seteagerendid(struct fsm *fsm, fsm_end_id_t id); + +/* Set an eager output ID to emit every time the state is entered. + * This turns the automaton into a Moore machine. Returns 1 on success, 0 on error. */ +int +fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id); + +/* Set an eager output ID on all current end states. Returns 1 on success, 0 on error. */ +int +fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id); + +/* HACK */ +typedef void +fsm_eager_output_cb(fsm_output_id_t id, void *opaque); +void +fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque); + +void +fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque); + /* * Find the state (if there is just one), or add epsilon edges from all states, * for which the given predicate is true. 
@@ -451,6 +488,15 @@ fsm_shortest(const struct fsm *fsm, fsm_state_t start, fsm_state_t goal, unsigned (*cost)(fsm_state_t from, fsm_state_t to, char c)); +/* HACK */ +typedef void +fsm_eager_endid_cb(fsm_end_id_t id, void *opaque); +void +fsm_eager_endid_set_cb(struct fsm *fsm, fsm_eager_endid_cb *cb, void *opaque); + +void +fsm_eager_endid_get_cb(const struct fsm *fsm, fsm_eager_endid_cb **cb, void **opaque); + /* * Execute an FSM reading input from the user-specified callback fsm_getc(). * fsm_getc() is passed the opaque pointer given, and is expected to return diff --git a/include/fsm/print.h b/include/fsm/print.h index c9ec7ec0a..a8aa4db54 100644 --- a/include/fsm/print.h +++ b/include/fsm/print.h @@ -45,6 +45,9 @@ enum fsm_print_lang { struct fsm_state_metadata { const fsm_end_id_t *end_ids; size_t end_id_count; + + const fsm_output_id_t *eager_output_ids; + size_t eager_output_count; }; /* diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile index 5e2ed57e3..c7782f0ff 100644 --- a/src/libfsm/Makefile +++ b/src/libfsm/Makefile @@ -8,6 +8,7 @@ SRC += src/libfsm/consolidate.c SRC += src/libfsm/clone.c SRC += src/libfsm/closure.c SRC += src/libfsm/detect_required.c +SRC += src/libfsm/eager_output.c SRC += src/libfsm/edge.c SRC += src/libfsm/empty.c SRC += src/libfsm/end.c diff --git a/src/libfsm/clone.c b/src/libfsm/clone.c index 9fd236a4d..2161599ae 100644 --- a/src/libfsm/clone.c +++ b/src/libfsm/clone.c @@ -19,6 +19,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #define LOG_CLONE_ENDIDS 0 @@ -28,6 +29,9 @@ copy_capture_actions(struct fsm *dst, const struct fsm *src); static int copy_end_ids(struct fsm *dst, const struct fsm *src); +static int +copy_eager_output_ids(struct fsm *dst, const struct fsm *src); + struct fsm * fsm_clone(const struct fsm *fsm) { @@ -80,6 +84,12 @@ fsm_clone(const struct fsm *fsm) fsm_free(new); return NULL; } + + /* does not copy callback */ + if (!copy_eager_output_ids(new, fsm)) { + 
fsm_free(new); + return NULL; + } } return new; @@ -159,3 +169,31 @@ copy_end_ids(struct fsm *dst, const struct fsm *src) return env.ok; } + +struct copy_eager_output_ids_env { + bool ok; + struct fsm *dst; +}; + +static int +copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct copy_eager_output_ids_env *env = opaque; + if (!fsm_seteageroutput(env->dst, state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +copy_eager_output_ids(struct fsm *dst, const struct fsm *src) +{ + struct copy_eager_output_ids_env env; + env.dst = dst; + env.ok = true; + + fsm_eager_output_iter_all(src, copy_eager_output_ids_cb, &env); + return env.ok; +} diff --git a/src/libfsm/consolidate.c b/src/libfsm/consolidate.c index 236a4f6f5..b7a8905b2 100644 --- a/src/libfsm/consolidate.c +++ b/src/libfsm/consolidate.c @@ -25,6 +25,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #define LOG_MAPPING 0 #define LOG_CONSOLIDATE_CAPTURES 0 @@ -53,6 +54,10 @@ static int consolidate_end_ids(struct fsm *dst, const struct fsm *src, const fsm_state_t *mapping, size_t mapping_count); +static int +consolidate_eager_output_ids(struct fsm *dst, const struct fsm *src, + const fsm_state_t *mapping, size_t mapping_count); + static fsm_state_t mapping_cb(fsm_state_t id, const void *opaque) { @@ -154,6 +159,10 @@ fsm_consolidate(const struct fsm *src, } } + if (!consolidate_eager_output_ids(dst, src, mapping, mapping_count)) { + goto cleanup; + } + f_free(src->alloc, seen); return dst; @@ -270,3 +279,40 @@ consolidate_end_ids(struct fsm *dst, const struct fsm *src, return ret; } + +struct consolidate_eager_output_ids_env { + bool ok; + struct fsm *dst; + const fsm_state_t *mapping; + size_t mapping_count; +}; + +static int +consolidate_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct consolidate_eager_output_ids_env *env = opaque; + assert(state < env->mapping_count); + 
const fsm_state_t dst_state = env->mapping[state]; + + if (!fsm_seteageroutput(env->dst, dst_state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +consolidate_eager_output_ids(struct fsm *dst, const struct fsm *src, + const fsm_state_t *mapping, size_t mapping_count) +{ + struct consolidate_eager_output_ids_env env = { + .ok = true, + .dst = dst, + .mapping = mapping, + .mapping_count = mapping_count, + }; + fsm_eager_output_iter_all(src, consolidate_eager_output_ids_cb, &env); + return env.ok; +} + diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index d5ba396a4..9833fd878 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -6,6 +6,9 @@ #include "determinise_internal.h" +#include +#include + static void dump_labels(FILE *f, const uint64_t labels[4]) { @@ -266,6 +269,10 @@ fsm_determinise_with_config(struct fsm *nfa, goto cleanup; } + if (!remap_eager_outputs(&map, issp, dfa, nfa)) { + goto cleanup; + } + fsm_move(nfa, dfa); } @@ -363,6 +370,22 @@ add_reverse_mapping(const struct fsm_alloc *alloc, return 1; } +static void +free_reverse_mappings(const struct fsm_alloc *alloc, size_t map_count, struct reverse_mapping *rmaps) +{ + if (rmaps == NULL) { return; } + + for (size_t map_i = 0; map_i < map_count; map_i++) { + struct reverse_mapping *rmap = &rmaps[map_i]; + if (rmap->list != NULL) { /* each entry owns a single list */ + f_free(alloc, rmap->list); + rmap->count = 0; + rmap->list = NULL; + } + } + f_free(alloc, rmaps); +} + static int det_copy_capture_actions_cb(fsm_state_t state, enum capture_action_type type, unsigned capture_id, fsm_state_t to, @@ -434,7 +457,7 @@ hash_iss(interned_state_set_id iss) } static struct mapping * -map_first(struct map *map, struct map_iter *iter) +map_first(const struct map *map, struct map_iter *iter) { iter->m = map; iter->i = 0; @@ -672,22 +695,14 @@ stack_pop(struct mappingstack *stack) return item; } -static int -remap_capture_actions(struct map *map, struct 
interned_state_set_pool *issp, - struct fsm *dst_dfa, struct fsm *src_nfa) +static struct reverse_mapping * +build_reverse_mappings(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa) { + struct reverse_mapping *reverse_mappings = NULL; struct map_iter it; struct state_iter si; struct mapping *m; - struct reverse_mapping *reverse_mappings; - fsm_state_t state; - const size_t capture_count = fsm_countcaptures(src_nfa); - size_t i, j; - int res = 0; - - if (capture_count == 0) { - return 1; - } /* This is not 1 to 1 -- if state X is now represented by multiple * states Y in the DFA, and state X has action(s) when transitioning @@ -698,9 +713,7 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, * checking reachability from every X, but the actual path * handling later will also check reachability. */ reverse_mappings = f_calloc(dst_dfa->alloc, src_nfa->statecount, sizeof(reverse_mappings[0])); - if (reverse_mappings == NULL) { - return 0; - } + if (reverse_mappings == NULL) { goto cleanup; } /* build reverse mappings table: for every NFA state X, if X is part * of the new DFA state Y, then add Y to a list for X */ @@ -710,6 +723,7 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, assert(m->dfastate < dst_dfa->statecount); ss = interned_state_set_get_state_set(issp, iss_id); + fsm_state_t state; for (state_set_reset(ss, &si); state_set_next(&si, &state); ) { if (!add_reverse_mapping(dst_dfa->alloc, reverse_mappings, @@ -719,33 +733,47 @@ remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, } } -#if LOG_DETERMINISE_CAPTURES +#if LOG_BUILD_REVERSE_MAPPING fprintf(stderr, "#### reverse mapping for %zu states\n", src_nfa->statecount); - for (i = 0; i < src_nfa->statecount; i++) { + for (size_t i = 0; i < src_nfa->statecount; i++) { struct reverse_mapping *rm = &reverse_mappings[i]; fprintf(stderr, "%lu:", i); - for (j = 0; j < 
rm->count; j++) { + for (size_t j = 0; j < rm->count; j++) { fprintf(stderr, " %u", rm->list[j]); } fprintf(stderr, "\n"); } -#else - (void)j; #endif + return reverse_mappings; + +cleanup: + free_reverse_mappings(dst_dfa->alloc, src_nfa->statecount, reverse_mappings); + return NULL; +} + +static int +remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, struct fsm *src_nfa) +{ + const size_t capture_count = fsm_countcaptures(src_nfa); + int res = 0; + + if (capture_count == 0) { + return 1; + } + + struct reverse_mapping *reverse_mappings = build_reverse_mappings(map, issp, dst_dfa, src_nfa); + if (reverse_mappings == NULL) { goto cleanup; } + if (!det_copy_capture_actions(reverse_mappings, dst_dfa, src_nfa)) { goto cleanup; } res = 1; -cleanup: - for (i = 0; i < src_nfa->statecount; i++) { - if (reverse_mappings[i].list != NULL) { - f_free(dst_dfa->alloc, reverse_mappings[i].list); - } - } - f_free(dst_dfa->alloc, reverse_mappings); +cleanup: + free_reverse_mappings(dst_dfa->alloc, src_nfa->statecount, reverse_mappings); return res; } @@ -2559,3 +2587,50 @@ analyze_closures__grow_outputs(struct analyze_closures_env *env) env->output_ceil = nceil; return 1; } + +struct remap_eager_output_env { + bool ok; + struct fsm *dst; + fsm_state_t dst_state; +}; + +static int +remap_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct remap_eager_output_env *env = opaque; + if (!fsm_seteageroutput(env->dst, env->dst_state, id)) { + env->ok = false; + return 0; + } + + return 1; +} + +static int +remap_eager_outputs(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa) +{ + /* For each DFA state, get the set of NFA states corresponding to it from the + * map and issp, then copy every eager output ID over. 
*/ + struct map_iter iter; + for (struct mapping *b = map_first(map, &iter); b != NULL; b = map_next(&iter)) { + struct state_set *ss = interned_state_set_get_state_set(issp, b->iss); + assert(ss != NULL); + + struct state_iter it; + fsm_state_t s; + state_set_reset(ss, &it); + while (state_set_next(&it, &s)) { + struct remap_eager_output_env env = { + .ok = true, + .dst = dst_dfa, + .dst_state = b->dfastate, + }; + fsm_eager_output_iter_state(src_nfa, s, remap_eager_output_cb, &env); + if (!env.ok) { return 0; } + } + } + + return 1; +} diff --git a/src/libfsm/determinise_internal.h b/src/libfsm/determinise_internal.h index cfd4ea663..2e925d28c 100644 --- a/src/libfsm/determinise_internal.h +++ b/src/libfsm/determinise_internal.h @@ -23,6 +23,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #include @@ -35,6 +36,7 @@ #define LOG_AC 0 #define LOG_GROUPING 0 #define LOG_ANALYSIS_STATS 0 +#define LOG_BUILD_REVERSE_MAPPING 0 #if LOG_DETERMINISE_CAPTURES || LOG_INPUT #include @@ -72,7 +74,7 @@ struct map { }; struct map_iter { - struct map *m; + const struct map *m; size_t i; }; @@ -304,7 +306,7 @@ static void map_free(struct map *map); static struct mapping * -map_first(struct map *map, struct map_iter *iter); +map_first(const struct map *map, struct map_iter *iter); static struct mapping * map_next(struct map_iter *iter); @@ -325,6 +327,10 @@ static int remap_capture_actions(struct map *map, struct interned_state_set_pool *issp, struct fsm *dst_dfa, struct fsm *src_nfa); +static int +remap_eager_outputs(const struct map *map, struct interned_state_set_pool *issp, + struct fsm *dst_dfa, const struct fsm *src_nfa); + static struct mappingstack * stack_init(const struct fsm_alloc *alloc); diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c new file mode 100644 index 000000000..e37a8a4bf --- /dev/null +++ b/src/libfsm/eager_output.c @@ -0,0 +1,403 @@ +/* + * Copyright 2024 Scott Vokes + * + * See LICENCE for 
the full copyright terms. + */ + +#include +#include + +#include "internal.h" + +#include +#include + +#include +#include +#include + +#include "eager_output.h" + +#define LOG_LEVEL 0 + +/* must be a power of 2 */ +#define DEF_BUCKET_COUNT 4 +#define DEF_ENTRY_CEIL 2 + +struct eager_output_info { + fsm_eager_output_cb *cb; + void *opaque; + + struct eager_output_htab { + size_t bucket_count; + size_t buckets_used; + /* empty if entry is NULL, otherwise keyed by state */ + struct eager_output_bucket { + fsm_state_t state; + struct eager_output_entry { + unsigned used; + unsigned ceil; + fsm_end_id_t ids[]; + } *entry; + } *buckets; + } htab; +}; + +void +fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque) +{ +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_set_cb %p\n", (void *)fsm); +#endif + assert(fsm != NULL); + assert(fsm->eager_output_info != NULL); + fsm->eager_output_info->cb = cb; + fsm->eager_output_info->opaque = opaque; +} + +void +fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque) +{ + *cb = fsm->eager_output_info->cb; + *opaque = fsm->eager_output_info->opaque; +} + +int +fsm_eager_output_init(struct fsm *fsm) +{ + struct eager_output_info *ei = f_calloc(fsm->alloc, 1, sizeof(*ei)); + + if (ei == NULL) { return 0; } + + struct eager_output_bucket *buckets = f_calloc(fsm->alloc, + DEF_BUCKET_COUNT, sizeof(buckets[0])); + if (buckets == NULL) { + f_free(fsm->alloc, ei); + return 0; + } + +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_init %p\n", (void *)fsm); +#endif + + ei->htab.buckets = buckets; + ei->htab.bucket_count = DEF_BUCKET_COUNT; + + fsm->eager_output_info = ei; + return 1; +} + +void +fsm_eager_output_free(struct fsm *fsm) +{ + if (fsm == NULL || fsm->eager_output_info == NULL) { return; } + + for (size_t i = 0; i < fsm->eager_output_info->htab.bucket_count; i++) { + struct eager_output_bucket *b = &fsm->eager_output_info->htab.buckets[i]; + if (b->entry == 
NULL) { continue; } + f_free(fsm->alloc, b->entry); + } + f_free(fsm->alloc, fsm->eager_output_info->htab.buckets); + + f_free(fsm->alloc, fsm->eager_output_info); +#if LOG_LEVEL > 2 + fprintf(stderr, "-- fsm_eager_output_free %p\n", (void *)fsm); +#endif + fsm->eager_output_info = NULL; +} + +int +fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id) +{ + assert(fsm != NULL); + const size_t count = fsm_countstates(fsm); + for (size_t i = 0; i < count; i++) { + if (fsm_isend(fsm, i)) { + if (!fsm_seteageroutput(fsm, i, id)) { return 0; } + } + } + return 1; +} + +static bool +grow_htab(const struct fsm_alloc *alloc, struct eager_output_htab *htab) +{ + const size_t nbucket_count = 2*htab->bucket_count; + assert(nbucket_count != 0); + + struct eager_output_bucket *nbuckets = f_calloc(alloc, nbucket_count, + sizeof(nbuckets[0])); + if (nbuckets == NULL) { return false; } + + const uint64_t nmask = nbucket_count - 1; + assert((nmask & nbucket_count) == 0); /* power of 2 */ + + for (size_t ob_i = 0; ob_i < htab->bucket_count; ob_i++) { + struct eager_output_bucket *ob = &htab->buckets[ob_i]; + if (ob->entry == NULL) { continue; } + + const uint64_t hash = hash_id(ob->state); + for (size_t probes = 0; probes < nbucket_count; probes++) { + const size_t nb_i = (hash + probes) & nmask; + struct eager_output_bucket *nb = &nbuckets[nb_i]; + if (nb->entry == NULL) { + nb->state = ob->state; + nb->entry = ob->entry; + break; + } else { + assert(nb->state != ob->state); + } + } + } + + f_free(alloc, htab->buckets); + htab->bucket_count = nbucket_count; + htab->buckets = nbuckets; + return true; +} + +int +fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) +{ + assert(fsm != NULL); + + struct eager_output_info *info = fsm->eager_output_info; + assert(info->htab.bucket_count > 0); + + if (info->htab.buckets_used >= info->htab.bucket_count/2) { + if (!grow_htab(fsm->alloc, &info->htab)) { return 0; } + } + + const uint64_t hash = hash_id(state); + 
const uint64_t mask = info->htab.bucket_count - 1; + assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ + + /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ + for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { + const size_t b_i = (hash + probes) & mask; + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ + /* __func__, state, b_i, b->state, (void *)b->entry); */ + struct eager_output_entry *e = b->entry; + if (e == NULL) { /* empty */ + /* add */ + const size_t alloc_sz = sizeof(*e) + + DEF_ENTRY_CEIL * sizeof(e->ids[0]); + e = f_calloc(fsm->alloc, 1, alloc_sz); + if (e == NULL) { + return 0; + } + e->ceil = DEF_ENTRY_CEIL; + b->state = state; + b->entry = e; + info->htab.buckets_used++; + /* fprintf(stderr, "%s: buckets_used %zd\n", __func__, info->htab.buckets_used); */ + /* fprintf(stderr, "%s: saved new entry in bucket %zd\n", __func__, b_i); */ + } else if (b->state != state) { /* collision */ + continue; + } + + if (e->used == e->ceil) { + const size_t nceil = 2 * e->ceil; + const size_t nsize = sizeof(*e) + + nceil * sizeof(e->ids[0]); + struct eager_output_entry *nentry = f_realloc(fsm->alloc, e, nsize); + if (nentry == NULL) { return 0; } + nentry->ceil = nceil; + b->entry = nentry; + e = b->entry; + } + + /* ignore duplicates */ + for (size_t i = 0; i < e->used; i++) { + if (e->ids[i] == id) { return 1; } + } + + e->ids[e->used++] = id; + /* fprintf(stderr, "%s: e->ids_used %u\n", __func__, e->used); */ + fsm->states[state].has_eager_outputs = 1; + return 1; + } + + return 1; +} + +bool +fsm_eager_output_has_eager_output(const struct fsm *fsm) +{ + assert(fsm->eager_output_info != NULL); + const struct eager_output_htab *htab = &fsm->eager_output_info->htab; + + for (size_t b_i = 0; b_i < htab->bucket_count; b_i++) { + struct eager_output_bucket *b = &htab->buckets[b_i]; + if (b->entry == NULL) { continue; } 
+ if (b->entry->used > 0) { return 1; } + } + return 0; +} + +bool +fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state) +{ + assert(state < fsm->statecount); + return fsm->states[state].has_eager_outputs; +} + +void +fsm_eager_output_iter_state(const struct fsm *fsm, + fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque) +{ + assert(fsm != NULL); + assert(cb != NULL); + assert(fsm->eager_output_info != NULL); + + const uint64_t hash = hash_id(state); + + struct eager_output_info *info = fsm->eager_output_info; + const uint64_t mask = info->htab.bucket_count - 1; + assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ + + for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { + const size_t b_i = (hash + probes) & mask; + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ + /* __func__, state, b_i, b->state, (void *)b->entry); */ + struct eager_output_entry *e = b->entry; + if (e == NULL) { /* empty */ + return; + } else if (b->state != state) { /* collision */ + continue; + } + + assert(e->used == 0 || fsm->states[state].has_eager_outputs); + + for (size_t i = 0; i < e->used; i++) { + if (!cb(state, e->ids[i], opaque)) { return; } + } + + /* fsm_seteageroutput reuses the bucket already keyed by a state, + * so once that bucket has been visited there is nothing further + * to find; stop instead of probing on to the next empty bucket. 
*/ + return; + } +} + +void +fsm_eager_output_iter_all(const struct fsm *fsm, + fsm_eager_output_iter_cb *cb, void *opaque) +{ + assert(fsm != NULL); + assert(cb != NULL); + assert(fsm->eager_output_info != NULL); + + struct eager_output_info *info = fsm->eager_output_info; + + /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ + for (size_t b_i = 0; b_i < info->htab.bucket_count; b_i++) { + struct eager_output_bucket *b = &info->htab.buckets[b_i]; + struct eager_output_entry *e = b->entry; + /* fprintf(stderr, "%s: b_i %zd, state %d, entry %p\n", */ + /* __func__, b_i, b->state, (void *)b->entry); */ + if (e == NULL) { /* empty */ + continue; + } + assert(e->used == 0 || fsm->states[b->state].has_eager_outputs); 
+ + for (size_t i = 0; i < e->used; i++) { + if (!cb(b->state, e->ids[i], opaque)) { return; } + } + } +} + +struct dump_env { + FILE *f; + size_t count; +}; + +static int +dump_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) + +{ + struct dump_env *env = opaque; + fprintf(env->f, "-- %d: id %d\n", state, id); + env->count++; + return 1; +} + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm) +{ + struct dump_env env = { .f = f }; + fprintf(f, "%s:\n", __func__); + fsm_eager_output_iter_all(fsm, dump_cb, (void *)&env); + fprintf(f, "== %zu total\n", env.count); +} + +static int +inc_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + (void)id; + size_t *count = opaque; + (*count)++; + return 1; +} + +bool +fsm_eager_output_has_any(const struct fsm *fsm, + fsm_state_t state, size_t *count) +{ + size_t c = 0; + fsm_eager_output_iter_state(fsm, state, &inc_cb, &c); + if (count != NULL) { *count = c; } + return c > 0; +} + +int +fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count) +{ + /* Don't reallocate unless something has actually changed. 
*/ + bool changes = false; + for (size_t i = 0; i < mapping_count; i++) { + if (mapping[i] != i) { + changes = true; + break; + } + } + + /* nothing to do */ + if (!changes) { return 1; } + + struct eager_output_info *eoi = fsm->eager_output_info; + + struct eager_output_bucket *nbuckets = f_calloc(fsm->alloc, + eoi->htab.bucket_count, sizeof(nbuckets[0])); + if (nbuckets == NULL) { + return 0; + } + + const uint64_t mask = eoi->htab.bucket_count - 1; + assert((eoi->htab.bucket_count & mask) == 0); + + for (size_t ob_i = 0; ob_i < eoi->htab.bucket_count; ob_i++) { + const struct eager_output_bucket *ob = &eoi->htab.buckets[ob_i]; + if (ob->entry == NULL) { continue; } + + assert(ob->state < mapping_count); + const fsm_state_t nstate = mapping[ob->state]; + if (nstate == FSM_STATE_REMAP_NO_STATE) { + /* The state was removed, so its entry is not carried over + * to nbuckets. Free it here: once the old bucket array is + * released below, nothing else would refer to it. Nothing + * after the nbuckets allocation can fail, so this is safe. */ + f_free(fsm->alloc, ob->entry); + assert(eoi->htab.buckets_used > 0); + eoi->htab.buckets_used--; /* keep the load-factor count accurate */ + continue; + } + + const uint64_t hash = hash_id(nstate); + + bool placed = false; + for (size_t probes = 0; probes < eoi->htab.bucket_count; probes++) { + const size_t nb_i = (hash + probes) & mask; + struct eager_output_bucket *nb = &nbuckets[nb_i]; + if (nb->entry == NULL) { + nb->state = nstate; + nb->entry = ob->entry; + placed = true; + break; + } + } + assert(placed); + } + + f_free(fsm->alloc, eoi->htab.buckets); + eoi->htab.buckets = nbuckets; + return 1; +} diff --git a/src/libfsm/eager_output.h b/src/libfsm/eager_output.h new file mode 100644 index 000000000..1b48ba4c4 --- /dev/null +++ b/src/libfsm/eager_output.h @@ -0,0 +1,46 @@ +#ifndef EAGER_OUTPUT_H +#define EAGER_OUTPUT_H + +#include +#include +#include + +struct eager_output_info; + +int +fsm_eager_output_init(struct fsm *fsm); + +void 
+fsm_eager_output_free(struct fsm *fsm); + +bool +fsm_eager_output_has_eager_output(const struct fsm *fsm); + +bool +fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state); + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +/* Callback for fsm_eager_output_iter_*. + * The return value indicates whether iteration should continue. 
+ * The results may not be sorted in any particular order. */ +typedef int +fsm_eager_output_iter_cb(fsm_state_t state, fsm_output_id_t id, void *opaque); + +void +fsm_eager_output_iter_state(const struct fsm *fsm, + fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque); + +void +fsm_eager_output_iter_all(const struct fsm *fsm, + fsm_eager_output_iter_cb *cb, void *opaque); + +bool +fsm_eager_output_has_any(const struct fsm *fsm, + fsm_state_t state, size_t *count); + +int +fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count); + +#endif diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index 9394a2d9b..adfcdec2a 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -9,24 +9,42 @@ #include #include #include +#include #include #include +#include #include #include #include #include +#include #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" #define DUMP_EPSILON_CLOSURES 0 #define DEF_PENDING_CAPTURE_ACTIONS_CEIL 2 #define LOG_RM_EPSILONS_CAPTURES 0 #define DEF_CARRY_ENDIDS_COUNT 2 +#define LOG_LEVEL 0 + +#if LOG_LEVEL > 0 +static bool log_it; +#define LOG(LVL, ...) \ + do { \ + if (log_it && LVL <= LOG_LEVEL) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) +#else +#define LOG(_LVL, ...) +#endif + struct remap_env { #ifndef NDEBUG char tag; @@ -57,6 +75,49 @@ static int carry_endids(struct fsm *fsm, struct state_set *states, fsm_state_t s); +static void +mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label); + +struct eager_output_buf { +#define DEF_EAGER_OUTPUT_BUF_CEIL 8 + bool ok; + const struct fsm_alloc *alloc; + size_t ceil; + size_t used; + fsm_output_id_t *ids; +}; + +static bool +append_eager_output_id(struct eager_output_buf *buf, fsm_output_id_t id) +{ + if (buf->used == buf->ceil) { + const size_t nceil = buf->ceil == 0 ? 
DEF_EAGER_OUTPUT_BUF_CEIL : 2*buf->ceil; + fsm_output_id_t *nids = f_realloc(buf->alloc, buf->ids, nceil * sizeof(nids[0])); + if (nids == NULL) { + buf->ok = false; + return false; + } + buf->ids = nids; + buf->ceil = nceil; + } + + for (size_t i = 0; i < buf->used; i++) { + /* avoid duplicates */ + if (buf->ids[i] == id) { return true; } + } + + buf->ids[buf->used++] = id; + return true; +} + +static int +collect_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct eager_output_buf *buf = opaque; + return append_eager_output_id(buf, id) ? 1 : 0; +} + int fsm_remove_epsilons(struct fsm *nfa) { @@ -64,9 +125,20 @@ fsm_remove_epsilons(struct fsm *nfa) int res = 0; struct state_set **eclosures = NULL; fsm_state_t s; + struct eager_output_buf eager_output_buf = { + .ok = true, + .alloc = nfa->alloc, + }; + uint64_t *reachable_by_label = NULL; + + LOG(2, "%s: starting\n", __func__); INIT_TIMERS(); +#if LOG_LEVEL > 0 + log_it = getenv("LOG") != NULL; +#endif + assert(nfa != NULL); TIME(&pre); @@ -94,6 +166,17 @@ fsm_remove_epsilons(struct fsm *nfa) } #endif + const size_t state_words = u64bitset_words(state_count); + reachable_by_label = f_calloc(nfa->alloc, state_words, sizeof(reachable_by_label[0])); + if (reachable_by_label == NULL) { goto cleanup; } + + mark_states_reachable_by_label(nfa, reachable_by_label); + + fsm_state_t start; + if (!fsm_getstart(nfa, &start)) { + goto cleanup; /* no start state */ + } + for (s = 0; s < state_count; s++) { struct state_iter si; fsm_state_t es_id; @@ -101,6 +184,12 @@ fsm_remove_epsilons(struct fsm *nfa) struct edge_group_iter egi; struct edge_group_iter_info info; + /* If the state isn't reachable by a label and isn't the start state, + * skip processing -- it will soon become garbage. */ + if (!u64bitset_get(reachable_by_label, s) && s != start) { + continue; + } + /* Process the epsilon closure. 
*/ state_set_reset(eclosures[s], &si); while (state_set_next(&si, &es_id)) { @@ -129,6 +218,16 @@ fsm_remove_epsilons(struct fsm *nfa) } } + /* Collect every eager output ID from any state + * in the current state's epsilon closure to the + * current state. These will be added at the end. */ + { + if (fsm_eager_output_has_any(nfa, es_id, NULL)) { + fsm_eager_output_iter_state(nfa, es_id, collect_eager_output_ids_cb, &eager_output_buf); + if (!eager_output_buf.ok) { goto cleanup; } + } + } + /* For every state in this state's transitive * epsilon closure, add all of their sets of * labeled edges. */ @@ -144,6 +243,13 @@ fsm_remove_epsilons(struct fsm *nfa) } } } + + for (size_t i = 0; i < eager_output_buf.used; i++) { + if (!fsm_seteageroutput(nfa, s, eager_output_buf.ids[i])) { + goto cleanup; + } + } + eager_output_buf.used = 0; /* clear */ } /* Remove the epsilon-edge state sets from everything. @@ -170,13 +276,53 @@ fsm_remove_epsilons(struct fsm *nfa) res = 1; cleanup: + LOG(2, "%s: finishing\n", __func__); if (eclosures != NULL) { closure_free(nfa, eclosures, state_count); } + f_free(nfa->alloc, reachable_by_label); + f_free(nfa->alloc, eager_output_buf.ids); return res; } +/* For every state, mark every state reached by a labeled edge as + * reachable. This doesn't check that the FROM state is reachable from + * the start state (trim will do that soon enough), it's just used to + * check which states will become unreachable once epsilon edges are + * removed. We don't need to add eager endids for them, because they + * will soon be disconnected from the epsilon-free NFA. 
*/ +static void +mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label) +{ + fsm_state_t start; + if (!fsm_getstart(nfa, &start)) { + return; /* nothing reachable */ + } + u64bitset_set(reachable_by_label, start); + + const fsm_state_t state_count = fsm_countstates(nfa); + + for (size_t s_i = 0; s_i < state_count; s_i++) { + struct edge_group_iter egi; + struct edge_group_iter_info info; + + struct fsm_state *s = &nfa->states[s_i]; + + /* Clear the visited flag, it will be used to avoid cycles. */ +#if 1 + assert(s->visited == 0); /* stale */ +#endif + s->visited = 0; + + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + LOG(1, "%s: reachable: %d\n", __func__, info.to); + u64bitset_set(reachable_by_label, info.to); + } + } +} + static int remap_capture_actions(struct fsm *nfa, struct state_set **eclosures) { @@ -425,4 +571,3 @@ carry_endids(struct fsm *fsm, struct state_set *states, return env.ok; } - diff --git a/src/libfsm/exec.c b/src/libfsm/exec.c index 9f7b21802..077494b8f 100644 --- a/src/libfsm/exec.c +++ b/src/libfsm/exec.c @@ -20,9 +20,12 @@ #include "internal.h" #include "capture.h" +#include "eager_output.h" #define LOG_EXEC 0 +#define LOG_EAGER 0 + static int transition(const struct fsm *fsm, fsm_state_t state, int c, size_t offset, struct fsm_capture *captures, @@ -43,6 +46,44 @@ transition(const struct fsm *fsm, fsm_state_t state, int c, return 1; } +struct check_eager_outputs_for_state_env { + const struct fsm *fsm; + fsm_eager_output_cb *cb; + void *opaque; +}; + +static int +match_eager_outputs_for_state_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) +{ + /* HACK update the types here once it's working */ + (void)state; + struct check_eager_outputs_for_state_env *env = opaque; +#if LOG_EAGER + fprintf(stderr, "%s: state %d, id %d\n", __func__, state, id); +#endif + env->cb(id, env->opaque); + return 1; +} + +static int 
+match_eager_outputs_for_state(const struct fsm *fsm, fsm_state_t state) +{ + /* HACK update the types here once it's working */ + fsm_eager_output_cb *cb = NULL; + void *opaque = NULL; + fsm_eager_output_get_cb(fsm, &cb, &opaque); + if (cb == NULL) { return 1; } /* nothing to do */ + + struct check_eager_outputs_for_state_env env = { + .fsm = fsm, + .cb = cb, + .opaque = opaque, + }; + fsm_eager_output_iter_state(fsm, + state, match_eager_outputs_for_state_cb, &env); + return 1; +} + int fsm_exec(const struct fsm *fsm, int (*fsm_getc)(void *opaque), void *opaque, @@ -73,6 +114,7 @@ fsm_exec(const struct fsm *fsm, errno = EINVAL; return -1; } + const fsm_state_t start = state; for (i = 0; i < capture_count; i++) { captures[i].pos[0] = FSM_CAPTURE_NO_POS; @@ -83,6 +125,12 @@ fsm_exec(const struct fsm *fsm, fprintf(stderr, "fsm_exec: starting at %d\n", state); #endif + if (fsm->states[start].has_eager_outputs) { + if (!match_eager_outputs_for_state(fsm, start)) { + return 0; + } + } + while (c = fsm_getc(opaque), c != EOF) { if (!transition(fsm, state, c, offset, captures, &state)) { #if LOG_EXEC @@ -91,6 +139,12 @@ fsm_exec(const struct fsm *fsm, return 0; } + if (fsm->states[state].has_eager_outputs) { + if (!match_eager_outputs_for_state(fsm, state)) { + return 0; + } + } + #if LOG_EXEC fprintf(stderr, "fsm_exec: @ %zu, input '%c', new state %u\n", offset, c, state); @@ -113,4 +167,3 @@ fsm_exec(const struct fsm *fsm, *end = state; return 1; } - diff --git a/src/libfsm/fsm.c b/src/libfsm/fsm.c index ba2d2db26..c442c8262 100644 --- a/src/libfsm/fsm.c +++ b/src/libfsm/fsm.c @@ -21,6 +21,7 @@ #include "internal.h" #include "capture.h" #include "endids.h" +#include "eager_output.h" /* guess for default state allocation */ #define FSM_DEFAULT_STATEALLOC 128 @@ -39,6 +40,7 @@ free_contents(struct fsm *fsm) fsm_capture_free(fsm); fsm_endid_free(fsm); + fsm_eager_output_free(fsm); f_free(fsm->alloc, fsm->states); } @@ -92,6 +94,14 @@ fsm_new_statealloc(const struct 
fsm_alloc *alloc, size_t statealloc) return NULL; } + if (!fsm_eager_output_init(new)) { + fsm_endid_free(new); /* helpers take new, so run them before it is freed */ + fsm_capture_free(new); + f_free(new->alloc, new->states); + f_free(new->alloc, new); /* last: nothing may touch new after this */ + return NULL; + } + return new; } @@ -133,6 +143,7 @@ fsm_move(struct fsm *dst, struct fsm *src) dst->capture_info = src->capture_info; dst->endid_info = src->endid_info; + dst->eager_output_info = src->eager_output_info; f_free(src->alloc, src); } diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index f84bbef0f..46997c82a 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -60,6 +60,10 @@ struct fsm_state { /* meaningful within one particular transformation only */ unsigned int visited:1; + + /* If 0, then this state has no need for checking + * the fsm->eager_output_info struct. */ + unsigned int has_eager_outputs:1; }; struct fsm { @@ -75,6 +79,7 @@ struct fsm { struct fsm_capture_info *capture_info; struct endid_info *endid_info; + struct eager_output_info *eager_output_info; }; struct fsm * diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index 67497d00a..f645c4ceb 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -2,6 +2,7 @@ fsm_complement fsm_union fsm_union_array +fsm_union_repeated_pattern_group fsm_intersect fsm_intersect_charset @@ -72,6 +73,8 @@ fsm_removestate fsm_shuffle fsm_vacuum +fsm_new_statealloc + fsm_addedge_any fsm_addedge_epsilon fsm_addedge_literal @@ -95,6 +98,14 @@ fsm_setendid fsm_mapendids fsm_increndids +fsm_endid_dump + +fsm_seteageroutput +fsm_seteageroutputonends +# short term hack +fsm_eager_output_set_cb +fsm_eager_output_dump + fsm_countedges fsm_countstates diff --git a/src/libfsm/merge.c b/src/libfsm/merge.c index 8c972c145..ccc1568ff 100644 --- a/src/libfsm/merge.c +++ b/src/libfsm/merge.c @@ -22,6 +22,7 @@ #include "capture.h" #include "internal.h" #include "endids.h" +#include 
"eager_output.h" #define LOG_MERGE_ENDIDS 0 @@ -39,6 +40,9 @@ copy_capture_actions(struct fsm *dst, 
struct fsm *src); static int copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src); +static int +copy_eager_output_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src); + static struct fsm * merge(struct fsm *dst, struct fsm *src, fsm_state_t *base_dst, fsm_state_t *base_src, @@ -113,6 +117,11 @@ merge(struct fsm *dst, struct fsm *src, return NULL; } + if (!copy_eager_output_ids(dst, src, *base_src)) { + /* non-recoverable -- destructive operation */ + return NULL; + } + f_free(src->alloc, src->states); src->states = NULL; src->statealloc = 0; @@ -194,6 +203,39 @@ copy_end_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) return fsm_endid_iter_bulk(src, copy_end_ids_cb, &env); } +struct copy_eager_output_ids_env { + bool ok; + struct fsm *dst; + struct fsm *src; + fsm_state_t base_src; +}; + +static int +copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct copy_eager_output_ids_env *env = opaque; + if (!fsm_seteageroutput(env->dst, state + env->base_src, id)) { + env->ok = false; + return 0; + } + + return 1; + +} + +static int +copy_eager_output_ids(struct fsm *dst, struct fsm *src, fsm_state_t base_src) +{ + struct copy_eager_output_ids_env env = { + .ok = true, + .dst = dst, + .src = src, + .base_src = base_src, + }; + fsm_eager_output_iter_all(src, copy_eager_output_ids_cb, &env); + return env.ok; +} + struct fsm * fsm_mergeab(struct fsm *a, struct fsm *b, fsm_state_t *base_b) diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index a8d53c57e..86f00b46f 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -25,6 +25,8 @@ #include "internal.h" #include "capture.h" +#include "eager_output.h" +#include "endids.h" #define LOG_MAPPINGS 0 #define LOG_STEPS 0 @@ -54,12 +56,21 @@ struct end_metadata { unsigned count; fsm_end_id_t *ids; } end; + + struct end_metadata_eager_outputs { + unsigned count; + fsm_output_id_t *ids; + } eager_outputs; }; static int collect_end_ids(const 
struct fsm *fsm, fsm_state_t s, struct end_metadata_end *e); +static int +collect_eager_output_ids(const struct fsm *fsm, fsm_state_t s, + struct end_metadata_eager_outputs *e); + int fsm_minimise(struct fsm *fsm) { @@ -122,6 +133,10 @@ fsm_minimise(struct fsm *fsm) /* Minimisation should never add states. */ assert(minimised_states <= orig_states); + for (size_t i = 0; i < fsm->statecount; i++) { + assert(mapping[i] < fsm->statecount); + } + /* Use the mapping to consolidate the current states * into a new DFA, combining states that could not be * proven distinguishable. */ @@ -693,6 +708,9 @@ same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) if (a->end.count != b->end.count) { return 0; } + if (a->eager_outputs.count != b->eager_outputs.count) { + return 0; + } /* compare -- these must be sorted */ @@ -702,6 +720,12 @@ same_end_metadata(const struct end_metadata *a, const struct end_metadata *b) } } + for (size_t i = 0; i < a->eager_outputs.count; i++) { + if (a->eager_outputs.ids[i] != b->eager_outputs.ids[i]) { + return 0; + } + } + return 1; } @@ -750,14 +774,21 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) #endif while (s != NO_ID) { struct end_metadata *e = &end_md[s]; - if (!fsm_isend(fsm, s)) { - break; /* this EC has non-end states, skip */ + const bool is_end = fsm_isend(fsm, s); + const bool has_eager_outputs = fsm_eager_output_state_has_eager_output(fsm, s); + + if (!is_end && !has_eager_outputs) { + break; /* skip */ } if (!collect_end_ids(fsm, s, &e->end)) { goto cleanup; } + if (!collect_eager_output_ids(fsm, s, &e->eager_outputs)) { + goto cleanup; + } + s = env->jump[s]; } } @@ -789,6 +820,10 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) incremental_hash_of_ids(&hash, s_md->end.ids[eid_i]); } + for (size_t eo_i = 0; eo_i < s_md->eager_outputs.count; eo_i++) { + incremental_hash_of_ids(&hash, s_md->eager_outputs.ids[eo_i]); + } + for (size_t b_i = 0; b_i < 
bucket_count; b_i++) { fsm_state_t *b = &htab[(b_i + hash) & mask]; const fsm_state_t other = *b; @@ -932,6 +967,9 @@ split_ecs_by_end_metadata(struct min_env *env, const struct fsm *fsm) if (e->end.ids != NULL) { f_free(fsm->alloc, e->end.ids); } + if (e->eager_outputs.ids != NULL) { + f_free(fsm->alloc, e->eager_outputs.ids); + } } f_free(fsm->alloc, end_md); } @@ -959,7 +997,7 @@ collect_end_ids(const struct fsm *fsm, fsm_state_t s, #if LOG_ECS fprintf(stderr, "%d:", s); - for (size_t i = 0; i < written; i++) { + for (size_t i = 0; i < e->count; i++) { fprintf(stderr, " %u", e->ids[i]); } fprintf(stderr, "\n"); @@ -968,6 +1006,41 @@ collect_end_ids(const struct fsm *fsm, fsm_state_t s, return 1; } +static int +collect_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + struct end_metadata_eager_outputs *e = opaque; + e->ids[e->count++] = id; + return 1; +} + +static int cmp_eager_output_id(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 
1 : 0; +} + +static int +collect_eager_output_ids(const struct fsm *fsm, fsm_state_t state, + struct end_metadata_eager_outputs *e) +{ + size_t count = 0; + if (!fsm_eager_output_has_any(fsm, state, &count)) { + return 1; /* nothing to do */ + } + + e->ids = f_malloc(fsm->alloc, count * sizeof(e->ids[0])); + if (e->ids == NULL) { return 0; } + + fsm_eager_output_iter_state(fsm, state, collect_cb, e); + + /* sort, to normalize set */ + qsort(e->ids, e->count, sizeof(e->ids[0]), cmp_eager_output_id); + return 1; +} + #if EXPENSIVE_CHECKS static void check_done_ec_offset(const struct min_env *env) diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index f55b2e748..b10cd9ace 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -228,6 +228,14 @@ print_case(FILE *f, const struct ir *ir, fsm_state_t state_id, .end_id_count = ir->states[state_id].endids.count, }; + if (cs->eager_outputs != NULL && opt->fragment) { + /* If .fragment is set and the state has eager outputs, then emit a call to a + * macro (the caller is expected to define). This is a temporary interface. */ + for (size_t i = 0; i < cs->eager_outputs->count; i++) { + fprintf(f, "\t\t\tFSM_SET_EAGER_OUTPUT(%u);\n", cs->eager_outputs->ids[i]); + } + } + switch (cs->strategy) { case IR_NONE: fprintf(f, "\t\t\t"); @@ -383,6 +391,11 @@ print_endstates(FILE *f, const struct fsm_state_metadata state_metadata = { .end_ids = ir->states[i].endids.ids, .end_id_count = ir->states[i].endids.count, + + .eager_output_count = (ir->states[i].eager_outputs == NULL + ? 0 : ir->states[i].eager_outputs->count), + .eager_output_ids = (ir->states[i].eager_outputs == NULL + ? 
NULL : ir->states[i].eager_outputs->ids), }; if (-1 == print_hook_accept(f, opt, hooks, diff --git a/src/libfsm/print/ir.c b/src/libfsm/print/ir.c index 457716dcc..81d5890e0 100644 --- a/src/libfsm/print/ir.c +++ b/src/libfsm/print/ir.c @@ -26,6 +26,7 @@ #include #include "libfsm/internal.h" +#include "libfsm/eager_output.h" #include "ir.h" @@ -505,6 +506,23 @@ make_example(const struct fsm *fsm, fsm_state_t s, char **example) return 0; } +static int +append_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct ir_state_eager_output *outputs = opaque; + (void)state; + outputs->ids[outputs->count++] = id; + return 1; +} + +static int +cmp_fsm_output_id_t(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + struct ir * make_ir(const struct fsm *fsm, const struct fsm_options *opt) { @@ -544,6 +562,8 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) ir->states[i].endids.ids = NULL; ir->states[i].endids.count = 0; + ir->states[i].eager_outputs = NULL; + if (fsm_isend(fsm, i)) { fsm_end_id_t *ids; size_t count; @@ -567,6 +587,20 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) ir->states[i].endids.count = count; } + size_t count; + if (fsm_eager_output_has_any(fsm, i, &count)) { + struct ir_state_eager_output *outputs = f_malloc(fsm->alloc, + sizeof(*outputs) + count * sizeof(outputs->ids[0])); + if (outputs == NULL) { + goto error; + } + outputs->count = 0; + fsm_eager_output_iter_state(fsm, i, append_eager_output_cb, outputs); + assert(outputs->count == count); + qsort(outputs->ids, outputs->count, sizeof(outputs->ids[0]), cmp_fsm_output_id_t); + ir->states[i].eager_outputs = outputs; + } + if (make_state(fsm, i, &ir->states[i]) == -1) { goto error; } @@ -630,6 +664,7 @@ free_ir(const struct fsm *fsm, struct ir *ir) for (i = 0; i < ir->n; i++) { f_free(fsm->alloc, (void *) 
ir->states[i].example); f_free(fsm->alloc, (void *) ir->states[i].endids.ids); + f_free(fsm->alloc, (void *) ir->states[i].eager_outputs); switch (ir->states[i].strategy) { case IR_TABLE: diff --git a/src/libfsm/print/ir.h b/src/libfsm/print/ir.h index 074097da3..b4b93a9eb 100644 --- a/src/libfsm/print/ir.h +++ b/src/libfsm/print/ir.h @@ -59,6 +59,11 @@ struct ir_state { size_t count; } endids; + struct ir_state_eager_output { + size_t count; + fsm_output_id_t ids[]; + } *eager_outputs; /* NULL -> 0 */ + unsigned int isend:1; enum ir_strategy strategy; diff --git a/src/libfsm/state.c b/src/libfsm/state.c index c845cbe46..d96c33653 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -19,6 +19,7 @@ #include "internal.h" #include "endids.h" +#include "eager_output.h" int fsm_addstate(struct fsm *fsm, fsm_state_t *state) @@ -44,6 +45,7 @@ fsm_addstate(struct fsm *fsm, fsm_state_t *state) for (i = fsm->statealloc; i < n; i++) { tmp[i].has_capture_actions = 0; + tmp[i].has_eager_outputs = 0; } fsm->statealloc = n; @@ -87,6 +89,8 @@ fsm_addstate_bulk(struct fsm *fsm, size_t n) new->visited = 0; new->epsilons = NULL; new->edges = NULL; + + new->has_eager_outputs = 0; } fsm->statecount += n; @@ -259,6 +263,10 @@ fsm_compact_states(struct fsm *fsm, if (!fsm_endid_compact(fsm, mapping, orig_statecount)) { return 0; } + if (!fsm_eager_output_compact(fsm, mapping, orig_statecount)) { + return 0; + } + assert(dst == kept); assert(kept == fsm->statecount); diff --git a/src/libfsm/union.c b/src/libfsm/union.c index a3b4b230c..0b18cd30c 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -15,9 +15,14 @@ #include #include #include +#include +#include #include "internal.h" +#include +#include "eager_output.h" + #define LOG_UNION_ARRAY 0 struct fsm * @@ -151,3 +156,231 @@ fsm_union_array(size_t fsm_count, return res; } + +#define LOG_UNION_REPEATED_PATTERN_GROUP 0 + +/* Combine an array of FSMs into a single FSM in one pass, with an extra loop + * so that more than one 
pattern with eager outputs can match. */ +struct fsm * +fsm_union_repeated_pattern_group(size_t entry_count, + struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases) +{ + const struct fsm_alloc *alloc = entries[0].fsm->alloc; + const bool log = 0 || LOG_UNION_REPEATED_PATTERN_GROUP; + + if (entry_count == 1) { + return entries[0].fsm; + } + + size_t est_total_states = 0; + for (size_t i = 0; i < entry_count; i++) { + assert(entries[i].fsm); + if (entries[i].fsm->alloc != alloc) { + errno = EINVAL; + return NULL; + } + const size_t count = fsm_countstates(entries[i].fsm); + est_total_states += count; + } + + est_total_states += 5; /* new start and end, new unanchored start and end loops */ + + struct fsm *res = fsm_new_statealloc(alloc, est_total_states); + if (res == NULL) { return NULL; } + + /* collected end states */ + struct ends_buf { + size_t ceil; + size_t used; + fsm_state_t *states; + } ends = { .ceil = 0 }; + + /* The new overall start state, which will have an epsilon edge to... */ + fsm_state_t global_start; + if (!fsm_addstate(res, &global_start)) { goto fail; } + + /* states linking to the starts of unanchored and anchored subgraphs, respectively. */ + fsm_state_t global_start_loop, global_start_anchored; + if (!fsm_addstate(res, &global_start_loop)) { goto fail; } + if (!fsm_addstate(res, &global_start_anchored)) { goto fail; } + + /* The unanchored end loop state, and an end state with no outgoing edges. 
*/ + fsm_state_t global_end_loop, global_end; + if (!fsm_addstate(res, &global_end)) { goto fail; } + if (!fsm_addstate(res, &global_end_loop)) { goto fail; } + + /* link the start to the start loop and anchored start, and the start loop to itself */ + if (log) { + fprintf(stderr, "link_before: global_start %d -> global_start_loop %d and global_start_anchored %d\n", + global_start, global_start_loop, global_start_anchored); + } + if (!fsm_addedge_epsilon(res, global_start, global_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_start, global_start_anchored)) { goto fail; } + if (!fsm_addedge_any(res, global_start_loop, global_start_loop)) { goto fail; } + + /* link the end loop and end */ + if (log) { + fprintf(stderr, "link_before: global_end_loop %d -> global_end %d (and -> self)\n", global_end_loop, global_end); + } + if (!fsm_addedge_epsilon(res, global_end_loop, global_end)) { goto fail; } + if (!fsm_addedge_any(res, global_end_loop, global_end_loop)) { goto fail; } + + if (bases != NULL) { + memset(bases, 0x00, entry_count * sizeof(bases[0])); + } + + for (size_t fsm_i = 0; fsm_i < entry_count; fsm_i++) { + ends.used = 0; /* reset */ + + struct fsm *fsm = entries[fsm_i].fsm; + entries[fsm_i].fsm = NULL; /* transfer ownership */ + + const size_t state_count = fsm_countstates(fsm); + + fsm_state_t fsm_start; + if (!fsm_getstart(fsm, &fsm_start)) { + fsm_free(fsm); /* no start, just discard */ + continue; + } + + for (fsm_state_t s_i = 0; s_i < state_count; s_i++) { + if (fsm_isend(fsm, s_i)) { + if (ends.used == ends.ceil) { /* grow? */ + size_t nceil = (ends.ceil == 0 ? 
4 : 2*ends.ceil); + fsm_state_t *nstates = f_realloc(alloc, + ends.states, nceil * sizeof(nstates[0])); + if (nstates == NULL) { goto fail; } + ends.ceil = nceil; + ends.states = nstates; + } + ends.states[ends.used++] = s_i; + } + } + + if (ends.used == 0) { + fsm_free(fsm); /* no ends, just discard */ + continue; + } + + /* When combining these, remove self-edges from any states on the FSMs to be + * combined that also have eager output IDs. We are about to add an epsilon edge + * from each to a shared state that won't have eager output IDs. + * + * Eager output matching should be idempotent, so carrying it to other reachable + * state is redundant, and it leads to a combinatorial explosion that blows up the + * state count while determinising the combined FSM otherwise. + * + * For example, if /aaa/, /bbb/, and /ccc/ are combined into a DFA that repeats + * the sub-patterns (like `^.*(?:(aaa)|(bbb)|(ccc))+.*$`), the self-edge at each + * eager output state would combine with every reachable state from then on, + * leading to a copy of the whole reachable subgraph colored by every + * combination of eager output IDs: aaa, bbb, ccc, aaa+bbb, aaa+ccc, + * bbb+ccc, aaa+bbb+ccc. Instead of three relatively separate subgraphs + * that set the eager output at their last state, one for each pattern, + * it leads to 8 (2**3) subgraph clusters because it encodes _each + * distinct combination_ in the DFA. This becomes incredibly expensive + * as the combined pattern count increases; it's essentially what I'm + * trying to avoid by adding eager output support in the first place. + * + * FIXME: instead of actively removing these, filter in fsm_determinise? 
*/ + if (fsm_eager_output_has_eager_output(fsm)) { + /* for any state that has eager outputs and a self edge, + * remove the self edge before further linkage */ + for (fsm_state_t s = 0; s < fsm->statecount; s++) { + if (!fsm_eager_output_has_any(fsm, s, NULL)) { continue; } + struct edge_set *edges = fsm->states[s].edges; + struct edge_set *new = edge_set_new(); + + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + if (info.to != s) { + if (!edge_set_add_bulk(&new, fsm->alloc, + info.symbols, info.to)) { + goto fail; + } + } + } + edge_set_free(fsm->alloc, edges); + fsm->states[s].edges = new; + } + } + + /* call fsm_merge; we really don't care which is which */ + struct fsm_combine_info combine_info; + struct fsm *merged = fsm_merge(res, fsm, &combine_info); + if (merged == NULL) { goto fail; } + + /* update offsets if res had its state IDs shifted forward */ + global_start += combine_info.base_a; + global_start_loop += combine_info.base_a; + global_start_anchored += combine_info.base_a;; + global_end += combine_info.base_a; + global_end_loop += combine_info.base_a; + + /* also update offsets for the FSM's states */ + fsm_start += combine_info.base_b; + for (size_t i = 0; i < ends.used; i++) { + ends.states[i] += combine_info.base_b; + } + + if (bases != NULL) { + bases[fsm_i].state = combine_info.base_b; + bases[fsm_i].capture = combine_info.capture_base_b; + } + + if (log) { + fprintf(stderr, "%s: fsm[%zd].start: %d\n", __func__, fsm_i, fsm_start); + for (size_t i = 0; i < ends.used; i++) { + fprintf(stderr, "%s: fsm[%zd].ends[%zd]: %d\n", __func__, fsm_i, i, ends.states[i]); + } + } + + /* link to the FSM's start state */ + const fsm_state_t start_src = entries[fsm_i].anchored_start ? 
global_start_anchored : global_start_loop; + if (!fsm_addedge_epsilon(merged, start_src, fsm_start)) { goto fail; } + if (log) { + fprintf(stderr, "%s: linking %s %d to fsm[%zd]'s start %d (anchored? %d)\n", + __func__, + entries[fsm_i].anchored_start ? "global_start_anchored" : "global_start_loop", + start_src, fsm_i, fsm_start, entries[fsm_i].anchored_start); + } + + /* link from the FSM's ends */ + const fsm_state_t end_dst = entries[fsm_i].anchored_end ? global_end : global_end_loop; + for (size_t i = 0; i < ends.used; i++) { + if (log) { + fprintf(stderr, "%s: linking fsm[%zd]'s end[%zd] %d (anchored? %d) to %s %d\n", + __func__, fsm_i, i, ends.states[i], entries[fsm_i].anchored_end, + entries[fsm_i].anchored_end ? "global_end" : "global_end_loop", + end_dst); + } + if (!fsm_addedge_epsilon(merged, ends.states[i], end_dst)) { goto fail; } + } + + res = merged; + } + + /* Link from the global_end_loop to the global_start_loop, so patterns with an + * unanchored start can follow other patterns with an unanchored end. */ + if (log) { + fprintf(stderr, "%s: g_start %d, g_start_loop %d, g_start_anchored %d, g_end_loop %d, g_end %d (after all merging)\n", + __func__, global_start, global_start_loop, global_start_anchored, global_end_loop, global_end); + fprintf(stderr, "%s: linking global_end_loop %d to global_start_loop %d\n", + __func__, global_end_loop, global_start_loop); + fprintf(stderr, "%s: setting global_start %d and end %d\n", __func__, global_start, global_end); + } + if (!fsm_addedge_epsilon(res, global_end_loop, global_start_loop)) { goto fail; } + + /* This needs to be set after merging, because that clears the start state. 
*/ + fsm_setstart(res, global_start); + fsm_setend(res, global_end, 1); + + f_free(alloc, ends.states); + return res; + +fail: + f_free(alloc, ends.states); + return NULL; +} diff --git a/tests/eager_output/Makefile b/tests/eager_output/Makefile new file mode 100644 index 000000000..a650bf802 --- /dev/null +++ b/tests/eager_output/Makefile @@ -0,0 +1,22 @@ +.include "../../share/mk/top.mk" + +TEST.tests/eager_output != ls -1 tests/eager_output/eager_output*.c +TEST_SRCDIR.tests/eager_output = tests/eager_output +TEST_OUTDIR.tests/eager_output = ${BUILD}/tests/eager_output + +.for n in ${TEST.tests/eager_output:T:R:C/^eager_output//} +INCDIR.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c += src/adt +.endfor + +SRC += ${TEST_SRCDIR.tests/eager_output}/utils.c + +.for n in ${TEST.tests/eager_output:T:R:C/^eager_output//} +test:: ${TEST_OUTDIR.tests/eager_output}/res${n} +SRC += ${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c +CFLAGS.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c += -UNDEBUG + +${TEST_OUTDIR.tests/eager_output}/run${n}: ${TEST_OUTDIR.tests/eager_output}/eager_output${n}.o ${TEST_OUTDIR.tests/eager_output}/utils.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a + ${CC} ${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/eager_output}/eager_output${n}.c} -o ${TEST_OUTDIR.tests/eager_output}/run${n} ${TEST_OUTDIR.tests/eager_output}/eager_output${n}.o ${TEST_OUTDIR.tests/eager_output}/utils.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a +${TEST_OUTDIR.tests/eager_output}/res${n}: ${TEST_OUTDIR.tests/eager_output}/run${n} + ( ${TEST_OUTDIR.tests/eager_output}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/eager_output}/res${n} +.endfor diff --git a/tests/eager_output/eager_output1.c b/tests/eager_output/eager_output1.c new file mode 100644 index 000000000..f20ef77b7 --- /dev/null +++ b/tests/eager_output/eager_output1.c @@ -0,0 +1,12 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "abc" }, + 
.inputs = { + { .input = "abc", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output2.c b/tests/eager_output/eager_output2.c new file mode 100644 index 000000000..cdac204e2 --- /dev/null +++ b/tests/eager_output/eager_output2.c @@ -0,0 +1,17 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab(c|d|e)" }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + { .input = "abd", .expected_ids = { 1 } }, + { .input = "abe", .expected_ids = { 1 } }, + { .input = "Xabe", .expected_ids = { 1 } }, + { .input = "abeX", .expected_ids = { 1 } }, + { .input = "XabeX", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output3.c b/tests/eager_output/eager_output3.c new file mode 100644 index 000000000..c11bc58a4 --- /dev/null +++ b/tests/eager_output/eager_output3.c @@ -0,0 +1,16 @@ +#include "utils.h" + +/* test that eager endids are correctly propagated through fsm_determinise() and fsm_minimise() */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab(c|d|e)?" 
}, + .inputs = { + { .input = "ab", .expected_ids = { 1 } }, + { .input = "abc", .expected_ids = { 1 } }, + { .input = "abd", .expected_ids = { 1 } }, + { .input = "abe", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output4.c b/tests/eager_output/eager_output4.c new file mode 100644 index 000000000..47cd32029 --- /dev/null +++ b/tests/eager_output/eager_output4.c @@ -0,0 +1,13 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "abcde$" }, + .inputs = { + { .input = "abcde", .expected_ids = { 1 } }, + { .input = "Xabcde", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output5.c b/tests/eager_output/eager_output5.c new file mode 100644 index 000000000..4551c68b1 --- /dev/null +++ b/tests/eager_output/eager_output5.c @@ -0,0 +1,14 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "^abc$", "^ab*c$" }, + .inputs = { + { .input = "ac", .expected_ids = { 2 } }, + { .input = "abc", .expected_ids = { 1, 2 } }, + { .input = "abbc", .expected_ids = { 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output6.c b/tests/eager_output/eager_output6.c new file mode 100644 index 000000000..5431d0981 --- /dev/null +++ b/tests/eager_output/eager_output6.c @@ -0,0 +1,34 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "apple", + "banana", + "carrot", + "durian", + "eggplant", + "fig", + "grapefruit", + "hazelnut", + "iceberg lettuce", + "jicama", + }, + .inputs = { + { .input = "apple", .expected_ids = { 1 } }, + { .input = "banana", .expected_ids = { 2 } }, + { .input = "carrot", .expected_ids = { 3 } }, + { .input = "durian", .expected_ids = { 4 } }, + { .input = "eggplant", .expected_ids = { 5 } }, + { .input = "fig", .expected_ids = { 6 } }, + { .input = 
"grapefruit", .expected_ids = { 7 } }, + { .input = "hazelnut", .expected_ids = { 8 } }, + { .input = "iceberg lettuce", .expected_ids = { 9 } }, + { .input = "jicama", .expected_ids = { 10 } }, + { .input = "apple banana carrot", .expected_ids = { 1, 2, 3 } }, + }, + }; + + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output7.c b/tests/eager_output/eager_output7.c new file mode 100644 index 000000000..3d123878b --- /dev/null +++ b/tests/eager_output/eager_output7.c @@ -0,0 +1,103 @@ +#include "utils.h" + +int main(void) +{ + /* Run this test with env FORCE_ENDIDS=N ... to see how much more + * expensive it is to combine the first N patterns using endids, + * rather than eager_outputs. It becomes VERY slow for >= 9 or so. + * (Note that the checks probably will not pass for N < 4, because + * it will start skipping patterns that appear in the early test inputs.) */ + bool force_endids = false; + size_t force_endid_count = 0; + { + const char *str = getenv("FORCE_ENDIDS"); + if (str != NULL) { + force_endid_count = atoi(str); + if (force_endid_count == 0) { + force_endid_count = 26; + } + force_endids = true; + } + } + + struct eager_output_test test = { + .patterns = { + [0] = "apple", + [1] = "banana", + [2] = "carrot", + [3] = "durian", + [4] = "eggplant", + [5] = "fig", + [6] = "grapefruit", + [7] = "hazelnut", + [8] = "iceberg lettuce", + [9] = "jicama", + [10] = "kiwano", + [11] = "lemon", + [12] = "mango", + [13] = "nectarine", + [14] = "orange", + [15] = "plum", + [16] = "quince", + [17] = "radish", + [18] = "strawberry", + [19] = "turnip", + [20] = "ube", + [21] = "vanilla", + [22] = "watermelon", + [23] = "xigua watermelon", + [24] = "yam", + [25] = "zucchini", + }, + .inputs = { + /* Note: expected IDs are shifted by 1, it's 0-terminated.
*/ + { .input = "apple", .expected_ids = { 1 } }, + { .input = "banana", .expected_ids = { 2 } }, + { .input = "carrot", .expected_ids = { 3 } }, + { .input = "apple banana", .expected_ids = { 1, 2 } }, + { .input = "carrot durian apple", .expected_ids = { 1, 3, 4 } }, + { .input = "carrot fig apple", .expected_ids = { 1, 3, 6 } }, + + /* leading characters and an incomplete trailing match */ + { .input = "mumble mumble fig hazelnut banana xigua watermelo", .expected_ids = { 2, 6, 8 } }, + + /* redundant matches */ + { .input = "ube ube ube ube ube", .expected_ids = { 21 } }, + + /* everything */ + { .input = + "apple banana carrot durian eggplant fig grapefruit " + "hazelnut iceberg lettuce jicamaa kiwano lemon mango " + "nectarine orange plum quince radish strawberry " + "turnip ube vanilla watermelon xigua watermelon yam zucchini", + .expected_ids = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + }, + }, + /* everything, only spaces appearing in patterns */ + { .input = + "applebananacarrotdurianeggplantfiggrapefruit" + "hazelnuticeberg lettucejicamaakiwanolemonmango" + "nectarineorangeplumquinceradishstrawberry" + "turnipubevanillawatermelonxigua watermelonyamzucchini", + .expected_ids = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + }, + }, + }, + }; + + /* truncate patterns to the first N */ + if (force_endids) { + assert(force_endid_count > 0 && force_endid_count <= 26); + test.patterns[force_endid_count] = NULL; + + /* truncate test inputs to just the first couple, since + * later inputs use later patterns */ + test.inputs[5].input = NULL; + } + + return run_test(&test, false, force_endids); +} diff --git a/tests/eager_output/eager_output_at_start.c b/tests/eager_output/eager_output_at_start.c new file mode 100644 index 000000000..407aa4e77 --- /dev/null +++ b/tests/eager_output/eager_output_at_start.c @@ -0,0 +1,12 @@ +#include "utils.h" + +int 
main(void) +{ + struct eager_output_test test = { + .patterns = { "" }, + .inputs = { + { .input = "", .expected_ids = { 1 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_fr1.c b/tests/eager_output/eager_output_fr1.c new file mode 100644 index 000000000..e8e5f3395 --- /dev/null +++ b/tests/eager_output/eager_output_fr1.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regression */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "ab", "" }, + .inputs = { + { .input = "ab", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_fr2.c b/tests/eager_output/eager_output_fr2.c new file mode 100644 index 000000000..404e98644 --- /dev/null +++ b/tests/eager_output/eager_output_fr2.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regression */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "", "" }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_fr3.c b/tests/eager_output/eager_output_fr3.c new file mode 100644 index 000000000..c7e4127a6 --- /dev/null +++ b/tests/eager_output/eager_output_fr3.c @@ -0,0 +1,13 @@ +#include "utils.h" + +/* Fuzzer regression */ +int main(void) +{ + struct eager_output_test test = { + .patterns = { "^", "" }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_mixed_anchored_unanchored.c b/tests/eager_output/eager_output_mixed_anchored_unanchored.c new file mode 100644 index 000000000..a586f9840 --- /dev/null +++ b/tests/eager_output/eager_output_mixed_anchored_unanchored.c @@ -0,0 +1,46 @@ +#include "utils.h" + +int main(void) +{ + /* fprintf(stderr, "%s: skipping for now, this doesn't pass yet.\n", __FILE__); */ + /* return EXIT_SUCCESS; */ + + struct
eager_output_test test = { + .patterns = { + "^abc$", + "def", + "^ghi", + "jkl$", + "mno", + }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + { .input = "def", .expected_ids = { 2 } }, + { .input = "ghi", .expected_ids = { 3 } }, + { .input = "jkl", .expected_ids = { 4 } }, + { .input = "mno", .expected_ids = { 5 } }, + + { .input = "defmno", .expected_ids = { 2, 5 } }, + { .input = " def mno ", .expected_ids = { 2, 5 } }, + + /* Matching a start-anchored pattern followed by + * unanchored ones should just work. */ + { .input = "ghi def", .expected_ids = { 2, 3 } }, + + /* An unanchored pattern before a start-anchored pattern + * should only match the unanchored pattern. */ + { .input = "def ghi", .expected_ids = { 2 } }, + + /* Matching an unanchored pattern before an + * end-anchored one is fine. */ + { .input = "mno jkl", .expected_ids = { 4, 5 } }, + + /* This should match "mno" with the "jkl" prefix + * ignored by the unanchored start, which does + * not count as a match for "jkl$". */ + { .input = "jkl mno", .expected_ids = { 5 } }, + }, + }; + + return run_test(&test, false, false); +} diff --git a/tests/eager_output/utils.c b/tests/eager_output/utils.c new file mode 100644 index 000000000..4bee8d848 --- /dev/null +++ b/tests/eager_output/utils.c @@ -0,0 +1,278 @@ +#include "utils.h" + +void +fsm_eager_output_dump(FILE *f, const struct fsm *fsm); + +void +fsm_endid_dump(FILE *f, const struct fsm *fsm); + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque) +{ + struct cb_info *info = (struct cb_info *)opaque; + assert(info->used < MAX_IDS); + + for (size_t i = 0; i < info->used; i++) { + if (info->ids[i] == id) { + return; /* already present */ + } + } + + info->ids[info->used++] = id; +} + +int +cmp_output(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 
1 : 0; +} + +struct fsm_options print_options = { + .consolidate_edges = 1, + .comments = 0, + .group_edges = 1, +}; + +void +dump(const struct fsm *fsm) +{ + fsm_print(stderr, fsm, + &print_options, NULL, FSM_PRINT_DOT); +} + +int +run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids) +{ + struct fsm_union_entry entries[MAX_PATTERNS] = {0}; + + allow_extra_outputs = false; + + size_t fsms_used = 0; + int ret = 0; + + int log = 0; + { + const char *logstr = getenv("LOG"); + if (logstr != NULL) { + if (logstr[0] == 'y') { /* make "y" or "yes" non-zero */ + logstr = "1"; + } + log = atoi(logstr); + } + } + + for (size_t i = 0; i < MAX_PATTERNS; i++) { + const char *p = test->patterns[i]; + if (test->patterns[i] == NULL) { break; } + const size_t len = strlen(p); + struct fsm_union_entry *e = &entries[fsms_used]; + + /* For sake of these patterns, they are anchored if the first/last + * character is '^' and '$', respectively. This is too simplistic + * for the general case, though. */ + if (len > 0) { + if (p[0] == '^') { e->anchored_start = true; } + if (p[len - 1] == '$') { e->anchored_end = true; } + /* fprintf(stderr, "%s: p[%zd]: '%s', start %d, end %d\n", */ + /* __func__, fsms_used, p, e->anchored_start, e->anchored_end); */ + } + + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + assert(fsm != NULL); + + /* Zero is used to terminate expected_ids, so don't use it here. */ + const fsm_output_id_t output_id = (fsm_output_id_t) (i + 1); + const fsm_end_id_t end_id = (fsm_end_id_t) (i + 1); + + /* Set either an end ID or an eager output ID, depending on + * whether the fsm is anchored at the end or not. 
*/ + if (e->anchored_end || force_endids) { + ret = fsm_setendid(fsm, end_id); + } else { + ret = fsm_seteageroutputonends(fsm, output_id); + } + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (pre det+min)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_determinise(fsm); + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (post det)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_minimise(fsm); + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== source DFA %zd (post det+min)\n", i); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + e->fsm = fsm; + fsms_used++; + } + + /* If there's only one pattern this just returns fsms[0]. */ + struct fsm *fsm = fsm_union_repeated_pattern_group(fsms_used, entries, NULL); + assert(fsm != NULL); + + if (log) { + fprintf(stderr, "==== combined (pre det+min)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + if (log) { + fprintf(stderr, "=== determinising combined... 
NFA has %u states\n", fsm_countstates(fsm)); + } + ret = fsm_determinise(fsm); + assert(ret == 1); + if (log) { + fprintf(stderr, "=== determinising combined...done, DFA has %u states\n", fsm_countstates(fsm)); + } + + if (log) { + fprintf(stderr, "==== combined (post det)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + ret = fsm_minimise(fsm); + if (log) { + fprintf(stderr, "=== minimised combined...done, DFA has %u states\n", fsm_countstates(fsm)); + } + assert(ret == 1); + + if (log) { + fprintf(stderr, "==== combined (post det+min)\n"); + if (log > 1) { dump(fsm); } + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } + + struct cb_info outputs = { 0 }; + fsm_eager_output_set_cb(fsm, append_eager_output_cb, &outputs); + + for (size_t i_i = 0; i_i < MAX_INPUTS; i_i++) { + outputs.used = 0; + const char *input = test->inputs[i_i].input; + if (input == NULL) { break; } + + size_t expected_id_count = 0; + for (size_t id_i = 0; id_i < MAX_ENDIDS; id_i++) { + const fsm_output_id_t id = test->inputs[i_i].expected_ids[id_i]; + if (id == 0) { break; } + expected_id_count++; + + /* must be ascending */ + if (id_i > 0) { + assert(id > test->inputs[i_i].expected_ids[id_i - 1]); + } + } + + if (log) { + fprintf(stderr, "%s: input %zd: \"%s\", expecting %zd ids:", + __func__, i_i, input, expected_id_count); + for (size_t i = 0; i < expected_id_count; i++) { + fprintf(stderr, " %d", test->inputs[i_i].expected_ids[i]); + } + } + + if (test->inputs[i_i].expect_fail) { + expected_id_count = 0; + } + + fsm_state_t end; /* only set on match */ + ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + + if (ret == 1) { +#define ENDID_BUF_SIZE 32 + fsm_end_id_t endid_buf[ENDID_BUF_SIZE] = {0}; + const size_t endid_count = fsm_endid_count(fsm, end); + /* fprintf(stderr, "%s: endid_count %zd for state %d\n", __func__, endid_count, end); */ + 
assert(endid_count < ENDID_BUF_SIZE); + if (!fsm_endid_get(fsm, end, /*ENDID_BUF_SIZE*/ endid_count, endid_buf)) { + assert(!"fsm_endid_get failed"); + } + + /* Copy endid outputs into outputs.ids[], since for testing + * purposes we don't care about the difference between eager + * output and endids here -- the values don't overlap. */ + assert(outputs.used + endid_count <= MAX_IDS); + for (size_t endid_i = 0; endid_i < endid_count; endid_i++) { + if (log) { + fprintf(stderr, "-- adding endid %zd: %d\n", endid_i, endid_buf[endid_i]); + } + outputs.ids[outputs.used++] = (fsm_output_id_t)endid_buf[endid_i]; + } + } + + if (ret == 0) { + /* if it didn't match, ignore the eager output IDs. this should + * eventually happen internal to fsm_exec or codegen. */ + outputs.used = 0; + } + + /* NEXT match IDs, sort outputs[] buffer first */ + qsort(outputs.ids, outputs.used, sizeof(outputs.ids[0]), cmp_output); + + if (log) { + fprintf(stderr, "-- got %zd:", outputs.used); + for (size_t i = 0; i < outputs.used; i++) { + fprintf(stderr, " %d", outputs.ids[i]); + } + fprintf(stderr, "\n"); + } + + if (expected_id_count == 0) { + assert(ret == 0 || outputs.used == 0); /* no match */ + continue; + } else { + assert(ret == 1); + } + + if (!allow_extra_outputs) { + assert(outputs.used == expected_id_count); + } else { + assert(outputs.used >= expected_id_count); + } + + size_t floor = 0; + for (size_t exp_i = 0; exp_i < outputs.used; exp_i++) { + bool found = false; + for (size_t got_i = floor; got_i < outputs.used; got_i++) { + if (outputs.ids[got_i] == test->inputs[i_i].expected_ids[exp_i]) { + floor = got_i + 1; + found = true; + break; + } + } + assert(found); + } + } + + fsm_free(fsm); + + return EXIT_SUCCESS;; +} diff --git a/tests/eager_output/utils.h b/tests/eager_output/utils.h new file mode 100644 index 000000000..672c01977 --- /dev/null +++ b/tests/eager_output/utils.h @@ -0,0 +1,64 @@ +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include 
+#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#define MAX_IDS 32 + +#include + +#include + +#define MAX_PATTERNS 150 +#define MAX_INPUTS 64 +#define MAX_ENDIDS 32 + +struct eager_output_test { + const char *patterns[MAX_PATTERNS]; + + struct { + const char *input; + bool expect_fail; + /* Terminated by 0. pattern[i] => id of i+1. Must be sorted. */ + fsm_output_id_t expected_ids[MAX_ENDIDS]; + } inputs[MAX_INPUTS]; +}; + +void +append_eager_output_cb(fsm_output_id_t id, void *opaque); + +int +cmp_output(const void *pa, const void *pb); + +int +run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids); + +struct cb_info { + size_t used; + fsm_end_id_t ids[MAX_IDS]; +}; + +void +dump(const struct fsm *fsm); + +void +append_eager_output_cb(fsm_end_id_t id, void *opaque); + +#endif From 74907caa28f7e241a3cdf0cf81242f8a1fd4b636 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 10 Oct 2024 13:32:38 -0400 Subject: [PATCH 31/80] Ensure .has_eager_outputs is zeroed on new states. (msan) --- src/libfsm/state.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libfsm/state.c b/src/libfsm/state.c index d96c33653..8f1146038 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -65,6 +65,7 @@ fsm_addstate(struct fsm *fsm, fsm_state_t *state) new->visited = 0; new->epsilons = NULL; new->edges = NULL; + new->has_eager_outputs = 0; } fsm->statecount++; From f2ddf1d3ab7e74bcbbe431d05f505aa19a04268d Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 10 Oct 2024 14:15:33 -0400 Subject: [PATCH 32/80] eager_output interface cleanup: Replace _any with _count and _get. 
fsm_eager_output_count and fsm_eager_output_get align better with the endid interface, and now fsm_eager_output_get ensures the buffer contents are sorted. --- include/fsm/fsm.h | 10 ++++++ src/libfsm/eager_output.c | 69 +++++++++++++++++++++++++++------------ src/libfsm/eager_output.h | 4 --- src/libfsm/epsilons.c | 3 +- src/libfsm/libfsm.syms | 2 ++ src/libfsm/minimise.c | 4 +-- src/libfsm/print/ir.c | 30 ++++------------- src/libfsm/union.c | 3 +- 8 files changed, 73 insertions(+), 52 deletions(-) diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index b862cb041..f78d91d71 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -303,6 +303,16 @@ fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque); void fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque); +/* Get the number of eager output IDs associated with a state. */ +size_t +fsm_eager_output_count(const struct fsm *fsm, fsm_state_t state); + +/* Get eager output associated with a state. It's expected that buf[] has + * sufficient space -- call fsm_eager_output_count first to get the count. + * The contents of buf will be sorted and unique. */ +void +fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, fsm_output_id_t *buf); + /* * Find the state (if there is just one), or add epsilon edges from all states, * for which the given predicate is true. diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c index e37a8a4bf..e00e96cd1 100644 --- a/src/libfsm/eager_output.c +++ b/src/libfsm/eager_output.c @@ -276,6 +276,55 @@ fsm_eager_output_iter_state(const struct fsm *fsm, } } +static int +inc_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + (void)state; + (void)id; + size_t *count = opaque; + (*count)++; + return 1; +} + +/* Get the number of eager output IDs associated with a state.
*/ +size_t +fsm_eager_output_count(const struct fsm *fsm, fsm_state_t state) +{ + size_t res = 0; + fsm_eager_output_iter_state(fsm, state, inc_cb, (void *)&res); + return res; +} + +struct get_env { + size_t count; + fsm_output_id_t *buf; +}; + +static int +append_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) +{ + struct get_env *env = opaque; + (void)state; + env->buf[env->count++] = id; + return 1; +} + +static int +cmp_fsm_output_id_t(const void *pa, const void *pb) +{ + const fsm_output_id_t a = *(fsm_output_id_t *)pa; + const fsm_output_id_t b = *(fsm_output_id_t *)pb; + return a < b ? -1 : a > b ? 1 : 0; +} + +void +fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, fsm_output_id_t *buf) +{ + struct get_env env = { .buf = buf }; + fsm_eager_output_iter_state(fsm, state, append_cb, &env); + qsort(buf, env.count, sizeof(buf[0]), cmp_fsm_output_id_t); +} + void fsm_eager_output_iter_all(const struct fsm *fsm, fsm_eager_output_iter_cb *cb, void *opaque) @@ -327,26 +376,6 @@ fsm_eager_output_dump(FILE *f, const struct fsm *fsm) fprintf(f, "== %zu total\n", env.count); } -static int -inc_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) -{ - (void)state; - (void)id; - size_t *count = opaque; - (*count)++; - return 1; -} - -bool -fsm_eager_output_has_any(const struct fsm *fsm, - fsm_state_t state, size_t *count) -{ - size_t c = 0; - fsm_eager_output_iter_state(fsm, state, &inc_cb, &c); - if (count != NULL) { *count = c; } - return c > 0; -} - int fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count) { diff --git a/src/libfsm/eager_output.h b/src/libfsm/eager_output.h index 1b48ba4c4..6093adc9e 100644 --- a/src/libfsm/eager_output.h +++ b/src/libfsm/eager_output.h @@ -36,10 +36,6 @@ void fsm_eager_output_iter_all(const struct fsm *fsm, fsm_eager_output_iter_cb *cb, void *opaque); -bool -fsm_eager_output_has_any(const struct fsm *fsm, - fsm_state_t state, size_t *count); - int fsm_eager_output_compact(struct 
fsm *fsm, fsm_state_t *mapping, size_t mapping_count); diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index adfcdec2a..8041c29d3 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -222,7 +222,8 @@ fsm_remove_epsilons(struct fsm *nfa) * in the current state's epsilon closure to the * current state. These will be added at the end. */ { - if (fsm_eager_output_has_any(nfa, es_id, NULL)) { + const size_t count = fsm_eager_output_count(nfa, es_id); + if (count > 0) { fsm_eager_output_iter_state(nfa, es_id, collect_eager_output_ids_cb, &eager_output_buf); if (!eager_output_buf.ok) { goto cleanup; } } diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index f645c4ceb..ab28b0a21 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -102,9 +102,11 @@ fsm_endid_dump fsm_seteageroutput fsm_seteageroutputonends +fsm_eager_output_count # short term hack fsm_eager_output_set_cb fsm_eager_output_dump +fsm_eager_output_get fsm_countedges fsm_countstates diff --git a/src/libfsm/minimise.c b/src/libfsm/minimise.c index 86f00b46f..a2ff1b818 100644 --- a/src/libfsm/minimise.c +++ b/src/libfsm/minimise.c @@ -1026,8 +1026,8 @@ static int collect_eager_output_ids(const struct fsm *fsm, fsm_state_t state, struct end_metadata_eager_outputs *e) { - size_t count = 0; - if (!fsm_eager_output_has_any(fsm, state, &count)) { + size_t count = fsm_eager_output_count(fsm, state); + if (count == 0) { return 1; /* nothing to do */ } diff --git a/src/libfsm/print/ir.c b/src/libfsm/print/ir.c index 81d5890e0..a18dadbbc 100644 --- a/src/libfsm/print/ir.c +++ b/src/libfsm/print/ir.c @@ -506,23 +506,6 @@ make_example(const struct fsm *fsm, fsm_state_t s, char **example) return 0; } -static int -append_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) -{ - struct ir_state_eager_output *outputs = opaque; - (void)state; - outputs->ids[outputs->count++] = id; - return 1; -} - -static int -cmp_fsm_output_id_t(const void *pa, const void *pb) 
-{ - const fsm_output_id_t a = *(fsm_output_id_t *)pa; - const fsm_output_id_t b = *(fsm_output_id_t *)pb; - return a < b ? -1 : a > b ? 1 : 0; -} - struct ir * make_ir(const struct fsm *fsm, const struct fsm_options *opt) { @@ -587,17 +570,16 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) ir->states[i].endids.count = count; } - size_t count; - if (fsm_eager_output_has_any(fsm, i, &count)) { + const size_t eager_output_count = fsm_eager_output_count(fsm, i); + if (eager_output_count > 0) { struct ir_state_eager_output *outputs = f_malloc(fsm->alloc, - sizeof(*outputs) + count * sizeof(outputs->ids[0])); + sizeof(*outputs) + eager_output_count * sizeof(outputs->ids[0])); if (outputs == NULL) { goto error; } - outputs->count = 0; - fsm_eager_output_iter_state(fsm, i, append_eager_output_cb, outputs); - assert(outputs->count == count); - qsort(outputs->ids, outputs->count, sizeof(outputs->ids[0]), cmp_fsm_output_id_t); + fsm_eager_output_get(fsm, i, outputs->ids); + outputs->count = eager_output_count; + ir->states[i].eager_outputs = outputs; } diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 0b18cd30c..126181992 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -287,7 +287,8 @@ fsm_union_repeated_pattern_group(size_t entry_count, /* for any state that has eager outputs and a self edge, * remove the self edge before further linkage */ for (fsm_state_t s = 0; s < fsm->statecount; s++) { - if (!fsm_eager_output_has_any(fsm, s, NULL)) { continue; } + const size_t eager_output_count = fsm_eager_output_count(fsm, s); + if (eager_output_count == 0) { continue; } struct edge_set *edges = fsm->states[s].edges; struct edge_set *new = edge_set_new(); From 981128f3b6456f9faeb8633f0728afe113785123 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 10 Oct 2024 14:16:59 -0400 Subject: [PATCH 33/80] minimise_test_oracle.c: mismatched eager outputs also prevent merging. 
Do the same check with eager outputs as it does with endids -- any two states with distinct sets for either should end up in different equivalence classes. --- src/libfsm/minimise_test_oracle.c | 107 ++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/src/libfsm/minimise_test_oracle.c b/src/libfsm/minimise_test_oracle.c index ec6d0d83c..20d4633a1 100644 --- a/src/libfsm/minimise_test_oracle.c +++ b/src/libfsm/minimise_test_oracle.c @@ -109,13 +109,19 @@ fsm_minimise_test_oracle(const struct fsm *fsm) fsm_state_t *tmp_map = NULL; fsm_state_t *mapping = NULL; - /* endid_group_assignments[X] = Y: state X is in endid group Y - * endid_group_leaders[X] = Y: see end state Y for endid group X */ - unsigned *endid_group_assignments = NULL; - size_t endid_group_count = 1; /* group 0 is the empty set */ - unsigned *endid_group_leaders = NULL; + /* End metadata grouping: The fixpoint algorithm here isn't aware + * of endids or eager outputs associated with particular states, + * so do a pass grouping them by matching end metadata. 
+ * + * end_md_group_assignments[X] = Y: state X is in end_md group Y + * end_md_group_leaders[X] = Y: see end state Y for end_md group X */ + unsigned *end_md_group_assignments = NULL; + size_t end_md_group_count = 1; /* group 0 is the empty set */ + unsigned *end_md_group_leaders = NULL; fsm_end_id_t *ids_a = NULL; fsm_end_id_t *ids_b = NULL; + fsm_output_id_t *eo_ids_a = NULL; + fsm_output_id_t *eo_ids_b = NULL; table = calloc(row_words * table_states, sizeof(table[0])); if (table == NULL) { goto cleanup; } @@ -126,11 +132,11 @@ fsm_minimise_test_oracle(const struct fsm *fsm) mapping = malloc(state_count * sizeof(mapping[0])); if (mapping == NULL) { goto cleanup; } - endid_group_assignments = calloc(state_count, sizeof(tmp_map[0])); - if (endid_group_assignments == NULL) { goto cleanup; } + end_md_group_assignments = calloc(state_count, sizeof(tmp_map[0])); + if (end_md_group_assignments == NULL) { goto cleanup; } - endid_group_leaders = calloc(state_count, sizeof(tmp_map[0])); - if (endid_group_leaders == NULL) { goto cleanup; } + end_md_group_leaders = calloc(state_count, sizeof(tmp_map[0])); + if (end_md_group_leaders == NULL) { goto cleanup; } /* macros for NxN bit table */ #define POS(X,Y) ((X*table_states) + Y) @@ -139,6 +145,7 @@ fsm_minimise_test_oracle(const struct fsm *fsm) #define CHECK(X,Y) u64bitset_get(table, POS(X,Y)) size_t max_endid_count = 0; + size_t max_eager_output_count = 0; /* Mark all pairs of states where one is final and one is not. * This includes the dead state. */ @@ -150,6 +157,12 @@ fsm_minimise_test_oracle(const struct fsm *fsm) } } + /* count eager outputs, not just on end states */ + const size_t eo_count = fsm_eager_output_count(fsm, i); + if (eo_count > max_eager_output_count) { + max_eager_output_count = eo_count; + } + for (size_t j = 0; j < i; j++) { const bool end_i = i == dead_state ? 
false : fsm_isend(fsm, i); @@ -171,50 +184,70 @@ fsm_minimise_test_oracle(const struct fsm *fsm) ids_b = malloc(max_endid_count * sizeof(ids_b[0])); if (ids_b == NULL) { goto cleanup; } - /* For every end state, check if it has endids. If not, assign it - * to endid group 0 (none). Otherwise, check if its endids match - * any of the other end states. If so, assign it to the same endid + eo_ids_a = malloc(max_eager_output_count * sizeof(eo_ids_a[0])); + if (eo_ids_a == NULL) { goto cleanup; } + eo_ids_b = malloc(max_eager_output_count * sizeof(eo_ids_b[0])); + if (eo_ids_b == NULL) { goto cleanup; } + + /* For every end state, check if it has endids or eager outputs. + * If not, assign it to group 0 (none). Otherwise, check if its IDs match + * any of the other end states. If so, assign it to the same * group, otherwise assign a new one and mark it as the leader. */ for (size_t i = 0; i < state_count; i++) { if (!fsm_isend(fsm, i)) { - endid_group_assignments[i] = 0; /* none */ + end_md_group_assignments[i] = 0; /* none */ continue; } - size_t count_a = fsm_endid_count(fsm, i); - assert(count_a <= max_endid_count); - if (count_a == 0) { + const size_t endid_count_a = fsm_endid_count(fsm, i); + assert(endid_count_a <= max_endid_count); + + const size_t eager_output_count_a = fsm_eager_output_count(fsm, i); + assert(eager_output_count_a <= max_eager_output_count); + + if (endid_count_a == 0 && eager_output_count_a == 0) { continue; } - int eres = fsm_endid_get(fsm, i, count_a, ids_a); + int eres = fsm_endid_get(fsm, i, endid_count_a, ids_a); assert(eres == 1); + fsm_eager_output_get(fsm, i, eo_ids_a); + bool found = false; /* note: skipping eg 0 here since that's the empty set */ - for (size_t eg_i = 1; eg_i < endid_group_count; eg_i++) { - size_t count_b = fsm_endid_count(fsm, endid_group_leaders[eg_i]); - if (count_b != count_a) { + for (size_t eg_i = 1; eg_i < end_md_group_count; eg_i++) { + size_t endid_count_b = fsm_endid_count(fsm, end_md_group_leaders[eg_i]); + 
if (endid_count_b != endid_count_a) { continue; } - assert(count_b > 0); - assert(count_b <= max_endid_count); - eres = fsm_endid_get(fsm, endid_group_leaders[eg_i], - count_b, ids_b); + const size_t eager_output_count_b = fsm_eager_output_count(fsm, end_md_group_leaders[eg_i]); + assert(eager_output_count_b <= max_eager_output_count); + if (eager_output_count_b != eager_output_count_a) { + continue; + } + + assert(endid_count_b > 0 || eager_output_count_b > 0); + assert(endid_count_b <= max_endid_count); + eres = fsm_endid_get(fsm, end_md_group_leaders[eg_i], + endid_count_b, ids_b); assert(eres == 1); - if (0 == memcmp(ids_a, ids_b, count_a * sizeof(ids_a[0]))) { + fsm_eager_output_get(fsm, end_md_group_leaders[eg_i], eo_ids_b); + + if ((0 == memcmp(ids_a, ids_b, endid_count_a * sizeof(ids_a[0]))) && + (0 == memcmp(eo_ids_a, eo_ids_b, eager_output_count_a * sizeof(eo_ids_a[0])))) { found = true; - endid_group_assignments[i] = eg_i; + end_md_group_assignments[i] = eg_i; break; } } if (!found) { - endid_group_assignments[i] = endid_group_count; - endid_group_leaders[endid_group_count] = i; - endid_group_count++; + end_md_group_assignments[i] = end_md_group_count; + end_md_group_leaders[end_md_group_count] = i; + end_md_group_count++; } } @@ -222,10 +255,10 @@ fsm_minimise_test_oracle(const struct fsm *fsm) * group must be distinguishable. 
*/ for (size_t i = 0; i < state_count; i++) { if (fsm_isend(fsm, i)) { - const unsigned i_group = endid_group_assignments[i]; + const unsigned i_group = end_md_group_assignments[i]; for (size_t j = 0; j < i; j++) { if (fsm_isend(fsm, j)) { - const unsigned j_group = endid_group_assignments[j]; + const unsigned j_group = end_md_group_assignments[j]; if (i_group != j_group) { MARK(i, j); } @@ -359,10 +392,12 @@ fsm_minimise_test_oracle(const struct fsm *fsm) free(table); free(tmp_map); free(mapping); - free(endid_group_assignments); - free(endid_group_leaders); + free(end_md_group_assignments); + free(end_md_group_leaders); free(ids_a); free(ids_b); + free(eo_ids_a); + free(eo_ids_b); return res; @@ -370,10 +405,12 @@ fsm_minimise_test_oracle(const struct fsm *fsm) if (table != NULL) { free(table); } if (tmp_map != NULL) { free(tmp_map); } if (mapping != NULL) { free(mapping); } - if (endid_group_assignments != NULL) { free(endid_group_assignments); } - if (endid_group_leaders != NULL) { free(endid_group_leaders); } + if (end_md_group_assignments != NULL) { free(end_md_group_assignments); } + if (end_md_group_leaders != NULL) { free(end_md_group_leaders); } if (ids_a != NULL) { free(ids_a); } if (ids_b != NULL) { free(ids_b); } if (res != NULL) { fsm_free(res); } + free(eo_ids_a); + free(eo_ids_b); return NULL; } From fa63fcd1fe272c1e7f68dbd48c25b40954945a90 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Sat, 12 Oct 2024 13:54:23 -0400 Subject: [PATCH 34/80] fuzz/target.c: re_is_anchor interface changes. 
--- fuzz/target.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fuzz/target.c b/fuzz/target.c index d56a9bf82..53b6003c1 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -503,13 +503,14 @@ fuzz_eager_output(const uint8_t *data, size_t size) } } - struct re_anchoring_info anchorage[MAX_PATTERNS] = {0}; + enum re_is_anchored_res anchorage[MAX_PATTERNS] = {0}; /* for each pattern, attempt to compile to a DFA */ for (size_t p_i = 0; p_i < env.pattern_count; p_i++) { const char *p = env.patterns[p_i]; - if (!re_is_anchored(RE_PCRE, fsm_sgetc, &p, 0, NULL, &anchorage[p_i])) { + enum re_is_anchored_res a = re_is_anchored(RE_PCRE, fsm_sgetc, &p, 0, NULL); + if (a == RE_IS_ANCHORED_ERROR) { continue; /* unsupported regex */ } @@ -599,8 +600,8 @@ fuzz_eager_output(const uint8_t *data, size_t size) } entries[used].fsm = cp; - entries[used].anchored_start = anchorage[i].start; - entries[used].anchored_end = anchorage[i].end; + entries[used].anchored_start = anchorage[i] & RE_IS_ANCHORED_START; + entries[used].anchored_end = anchorage[i] & RE_IS_ANCHORED_END; used++; } From 496198d5b19230174802e69a62c30f27ae8ff25b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 10 Oct 2024 15:48:13 -0400 Subject: [PATCH 35/80] fuzzer: Add seed argument for fsm_generate_matches (interface change). --- fuzz/target.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fuzz/target.c b/fuzz/target.c index 53b6003c1..4ff8b63bd 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -446,6 +446,8 @@ fuzz_eager_output(const uint8_t *data, size_t size) size_t max_pattern_length = 0; + const unsigned seed = size == 0 ? 0 : data[0]; + /* chop data into a series of patterns */ { size_t prev = 0; @@ -646,7 +648,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) * Use the combined DFA to generate matches, check that the * match behavior agrees with the individual DFA copies. 
*/ env.current_pattern = (size_t)-1; - if (!fsm_generate_matches(env.combined, max_pattern_length, gen_combined_check_individual_cb, &env)) { + if (!fsm_generate_matches(env.combined, max_pattern_length, seed, gen_combined_check_individual_cb, &env)) { goto cleanup; } @@ -656,7 +658,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) /* check behavior against the combined DFA. */ for (size_t i = 0; i < env.pattern_count; i++) { env.current_pattern = i; - if (!fsm_generate_matches(env.combined, max_pattern_length, gen_individual_check_combined_cb, &env)) { + if (!fsm_generate_matches(env.combined, max_pattern_length, seed, gen_individual_check_combined_cb, &env)) { goto cleanup; } } From e2b913072545ee52e92efbc5f41f90c90cb44549 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 31 Jan 2025 15:09:24 -0500 Subject: [PATCH 36/80] Fix memory leak in fsm_eager_output_compact, found while fuzzing. --- src/libfsm/eager_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c index e00e96cd1..00fa1b5f0 100644 --- a/src/libfsm/eager_output.c +++ b/src/libfsm/eager_output.c @@ -408,7 +408,10 @@ fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_c assert(ob->state < mapping_count); const fsm_state_t nstate = mapping[ob->state]; - if (nstate == FSM_STATE_REMAP_NO_STATE) { continue; } + if (nstate == FSM_STATE_REMAP_NO_STATE) { + f_free(fsm->alloc, ob->entry); + continue; + } const uint64_t hash = hash_id(nstate); From 36d018726f3d5f2e252ee3c866865e0ffdbe8c28 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 4 Feb 2025 12:45:08 -0500 Subject: [PATCH 37/80] Fix fsm_union_repeated_pattern_group's anchoring linkage. Previously this didn't handle mixed anchoring correctly, potentially leading to false positives in the case represented by eager_output_alt_mixing_anchored_and_unanchored.c. See comments in fsm_union_repeated_pattern_group for details. 
Fuzzing did not turn up any new issues. Another commit after this will make a few small interface changes and update callers. --- src/libfsm/union.c | 751 ++++++++++++++---- ...utput_alt_mixing_anchored_and_unanchored.c | 29 + .../eager_output_at_start_multiple_anchored.c | 15 + .../eager_output_at_start_single_anchored.c | 22 + tests/eager_output/utils.c | 90 +-- 5 files changed, 716 insertions(+), 191 deletions(-) create mode 100644 tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c create mode 100644 tests/eager_output/eager_output_at_start_multiple_anchored.c create mode 100644 tests/eager_output/eager_output_at_start_single_anchored.c diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 126181992..ed71384ff 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -21,9 +21,54 @@ #include "internal.h" #include +#include + #include "eager_output.h" +#include "endids.h" #define LOG_UNION_ARRAY 0 +#define LOG_ANALYZE_GROUP_NFA_RESULTS 0 +#define LOG_UNION_REPEATED_PATTERN_GROUP 0 + +#define NO_STATE (fsm_state_t)-1 + +/* State/edge info gathered about an NFA. Used by fsm_union_repeated_pattern_group. */ +struct analysis_info { + bool nullable; /* Does the NFA match the empty string? */ + fsm_state_t start; /* start state */ + + /* The states with a /./ self edge representing the unanchored + * start and end, or NO_STATE. There can be at most one of each. */ + fsm_state_t unanchored_start_loop; + fsm_state_t unanchored_end_loop; + + /* The end state following the unanchored end loop. */ + fsm_state_t unanchored_end_loop_end; + + /* State that links to paths only reachable from the beginning of input. */ + fsm_state_t anchored_start; + + /* States leading to an anchored end. */ + struct state_set *anchored_ends; + + /* States with an outgoing labeled edge to the unanchored end loop. Input + * following those edges has matched, but may still consume trailing input. + * These edges correspond to edges leaving capture group 0 in PCRE. 
*/ + struct state_set *eager_matches; + + /* Edges leading to states that can only match at the start of input. */ + struct edge_set *anchored_firsts; + + /* Edges leading to states that can begin an unanchored match, + * potentially after other combined patterns have matched. */ + struct edge_set *repeatable_firsts; + + /* A new state that may be added while replacing the unanchored_end_loop, + * if present. This state exists to set an eager output ID and have an + * epsilon edge to the combined NFA's global_unanchored_end_loop. + * The old unanchored_end_loop will be disconnected. */ + fsm_state_t eager_match_state; +}; struct fsm * fsm_union(struct fsm *a, struct fsm *b, @@ -157,10 +202,441 @@ fsm_union_array(size_t fsm_count, return res; } -#define LOG_UNION_REPEATED_PATTERN_GROUP 0 +static bool +has_dot_self_edge(const struct fsm *nfa, fsm_state_t s_i) +{ + const struct fsm_state *s = &nfa->states[s_i]; + + struct edge_group_iter ei; + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &ei); + struct edge_group_iter_info info; + while (edge_set_group_iter_next(&ei, &info)) { + if (info.to != s_i) { continue; } + for (size_t i = 0; i < 256/64; i++) { + if (info.symbols[i] != (uint64_t)-1) { continue; } + } + return true; + } + + return false; +} + +#if LOG_ANALYZE_GROUP_NFA_RESULTS +static void +dump_state_set(FILE *f, const char *name, const struct state_set *set) +{ + struct state_iter si; + fsm_state_t s; + if (state_set_empty(set)) { return; } + + fprintf(f, " - %s:", name); + state_set_reset(set, &si); + while (state_set_next(&si, &s)) { + fprintf(f, " %d", s); + } + fprintf(f, "\n"); +} + +static void +dump_edge_set(FILE *f, const char *name, fsm_state_t from, const struct edge_set *edges) +{ + struct edge_group_iter iter; + struct edge_group_iter_info info; + if (edge_set_empty(edges)) { return; } + + fprintf(f, " - %s:", name); + edge_set_group_iter_reset(edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + 
fprintf(f, " %d->%d", from, info.to); + } + fprintf(f, "\n"); +} +#endif + +/* If there's a labeled edge to an end state, check if the label set is + * only [\n] and there's also an epsilon edge to the same end state. + * This represents an anchored end in the NFA. */ +static bool +has_epsilon_and_newline_edges_to_end(const struct fsm *nfa, fsm_state_t s_i, fsm_state_t *dst_end) +{ + assert(s_i < nfa->statecount); + const struct fsm_state *s = &nfa->states[s_i]; + + if (state_set_empty(s->epsilons)) { return false; } + if (edge_set_empty(s->edges)) { return false; } + + struct edge_group_iter iter; + struct edge_group_iter_info info; + + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + /* Look for an edge set with only '\n' */ + if ((info.symbols[0] != (1ULL << '\n')) + || info.symbols[1] || info.symbols[2] || info.symbols[3]) { + continue; + } + + if (fsm_isend(nfa, info.to)) { + struct state_iter si; + fsm_state_t os_i; + state_set_reset(s->epsilons, &si); + while (state_set_next(&si, &os_i)) { + if (os_i == info.to) { + *dst_end = info.to; + return true; + } + } + } + } + + return false; +} + +static bool +has_labeled_edge_to_unanchored_end_loop(const struct fsm *nfa, + fsm_state_t s_i, fsm_state_t unanchored_end_loop) +{ + if (unanchored_end_loop == NO_STATE) { return false; } + assert(unanchored_end_loop < nfa->statecount); + + /* The unanchored_end_loop's self-edge doesn't count here. 
*/ + if (s_i == unanchored_end_loop) { return false; } + + assert(s_i < nfa->statecount); + const struct fsm_state *s = &nfa->states[s_i]; + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + if (info.to == unanchored_end_loop) { + return true; + } + } + return false; +} + +static bool +start_state_epsilon_closure_matches_empty_string__iter(const struct fsm *nfa, + fsm_state_t s_i, struct state_set **seen, bool *result) +{ + if (*result) { return true; } + if (!state_set_add(seen, nfa->alloc, s_i)) { return false; } + + if (fsm_isend(nfa, s_i)) { + *result = true; + return true; + } + + assert(s_i < nfa->statecount); + const struct fsm_state *s = &nfa->states[s_i]; + + struct state_iter si; + state_set_reset(s->epsilons, &si); + fsm_state_t ns_i; + while (state_set_next(&si, &ns_i)) { + if (!state_set_contains(*seen, ns_i)) { + if (!start_state_epsilon_closure_matches_empty_string__iter(nfa, ns_i, seen, result)) { + return false; + } + if (*result) { break; } + } + } + + return true; +} + +/* Does the start state's epsilon closure match the empty string? + * Returns false on error, otherwise returns true and sets *result. 
*/ +static bool +start_state_epsilon_closure_matches_empty_string(const struct fsm *nfa, fsm_state_t start, bool *result) +{ + struct state_set *seen = NULL; /* empty set */ + if (!start_state_epsilon_closure_matches_empty_string__iter(nfa, start, &seen, result)) { return false; } + state_set_free(seen); + + return true; +} + +static bool +analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) +{ + memset(ainfo, 0x00, sizeof(*ainfo)); + ainfo->start = NO_STATE; + ainfo->unanchored_start_loop = NO_STATE; + ainfo->anchored_start = NO_STATE; + ainfo->unanchored_end_loop = NO_STATE; + ainfo->unanchored_end_loop_end = NO_STATE; + ainfo->eager_match_state = NO_STATE; + + if (!fsm_getstart(nfa, &ainfo->start)) { + return false; + } + + const size_t state_count = fsm_countstates(nfa); + assert(ainfo->start < state_count); + + { + const struct fsm_state *s = &nfa->states[ainfo->start]; + + struct state_iter si; + state_set_reset(s->epsilons, &si); + fsm_state_t ns_i; + while (state_set_next(&si, &ns_i)) { + if (ns_i == ainfo->start) { continue; } + + struct edge_group_iter egi; + struct edge_group_iter_info info; + + assert(ns_i < state_count); + const struct fsm_state *ns = &nfa->states[ns_i]; + + /* if there's a state in the start state's epsilon closure that + * has a dot self-edge, it's the unanchored start loop */ + if (has_dot_self_edge(nfa, ns_i)) { + assert(ainfo->unanchored_start_loop == NO_STATE); + ainfo->unanchored_start_loop = ns_i; + + /* copy its non-self labeled edges to ainfo->repeatable_firsts */ + edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + if (info.to != ns_i) { + if (!edge_set_add_bulk(&ainfo->repeatable_firsts, + nfa->alloc, info.symbols, info.to)) { + goto alloc_fail; + } + } + } + } else { + /* likewise, a state without a dot self-edge is the anchored start */ + assert(ainfo->anchored_start == NO_STATE); + ainfo->anchored_start = ns_i; + + /* copy its labeled 
edges to ainfo->anchored_firsts */ + edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + if (!edge_set_add_bulk(&ainfo->anchored_firsts, + nfa->alloc, info.symbols, info.to)) { + goto alloc_fail; + } + } + } + } + } + + /* If the start state always matches, set a flag noting that it will need special handling + * later. It's arguably pointless to combine "" with other regexes, because it will always + * trivially match, but otherwise it would never match. */ + if (!start_state_epsilon_closure_matches_empty_string(nfa, ainfo->start, &ainfo->nullable)) { + goto alloc_fail; + } + + /* If there's a state with a dot self-edge and an epsilon edge to an end state, it's + * the unanchored end loop. There should only be one. */ + for (size_t s_i = 0; s_i < state_count; s_i++) { + const struct fsm_state *s = &nfa->states[s_i]; + if (has_dot_self_edge(nfa, s_i)) { + struct state_iter si; + state_set_reset(s->epsilons, &si); + fsm_state_t ns_i; + while (state_set_next(&si, &ns_i)) { + if (fsm_isend(nfa, ns_i)) { + assert(ainfo->unanchored_end_loop == NO_STATE); + ainfo->unanchored_end_loop = s_i; + ainfo->unanchored_end_loop_end = ns_i; + break; + } + } + if (ainfo->unanchored_end_loop != NO_STATE) { break; } + } + } + + /* Collect states that lead to an anchored end or eager match. */ + for (size_t s_i = 0; s_i < state_count; s_i++) { + fsm_state_t dst_end = NO_STATE; + if (has_epsilon_and_newline_edges_to_end(nfa, s_i, &dst_end)) { + if (!state_set_add(&ainfo->anchored_ends, nfa->alloc, dst_end)) { + goto alloc_fail; + } + } + + if (has_labeled_edge_to_unanchored_end_loop(nfa, s_i, ainfo->unanchored_end_loop)) { + if (!state_set_add(&ainfo->eager_matches, nfa->alloc, s_i)) { + goto alloc_fail; + } + } + } -/* Combine an array of FSMs into a single FSM in one pass, with an extra loop - * so that more than one pattern with eager outputs can match. 
*/ +#if LOG_ANALYZE_GROUP_NFA_RESULTS + { + fprintf(stderr, "# analysis_info start %d, usl %d, uel %d, uele %d\n", + ainfo->start, ainfo->unanchored_start_loop, ainfo->unanchored_end_loop, ainfo->unanchored_end_loop_end); + dump_state_set(stderr, "anchored_ends", ainfo->anchored_ends); + dump_state_set(stderr, "eager_matches", ainfo->eager_matches); + dump_edge_set(stderr, "anchored_firsts", ainfo->anchored_start, ainfo->anchored_firsts); + dump_edge_set(stderr, "repeatable_firsts", ainfo->unanchored_start_loop, ainfo->repeatable_firsts); + } +#endif + return true; + +alloc_fail: + fprintf(stderr, "alloc fail\n"); + return false; +} + +/* Replace any labeled edges on nfa->states[from_state] going to old_to + * with a new edge leading to new_to. There currently isn't a function in + * the libfsm API for this (and it shouldn't be necessary in general), but + * if it gets one later this can be replaced. */ +static bool +replace_labeled_edge(struct fsm *nfa, fsm_state_t from_state, fsm_state_t old_to, fsm_state_t new_to) +{ + if (old_to == NO_STATE) { + /* nothing to do */ + return true; + } + assert(new_to < nfa->statecount); + assert(from_state < nfa->statecount); + + struct fsm_state *from = &nfa->states[from_state]; + struct edge_set *old_edges = from->edges; + struct edge_set *new_edges = edge_set_new(); + + /* copy, replacing edges to old_to */ + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(old_edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + if (!edge_set_add_bulk(&new_edges, nfa->alloc, info.symbols, + info.to == old_to ? 
new_to : info.to)) { + return false; + } + } + + edge_set_free(nfa->alloc, old_edges); + from->edges = new_edges; + return true; +} + +/* Make a couple changes to the group NFA so that it can be combined correctly: + * + * - If the group NFA has an unanchored_end_loop, add a new state, + * eager_match_state, which will be a waypoint between edges that previously + * led to the unanchored_end_loop and the global NFA's global_unanchored_end_loop + * (so it can potentially also match other group NFAs with unanchored starts). + * This state will get an eager output ID. + * + * - Set an end ID on every anchored end state, so halting on these counts as a match. */ +static bool +modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t id_base) +{ + const bool nullable_and_unanchored_end = ainfo->nullable + && ainfo->unanchored_end_loop != NO_STATE; + + /* Add the eager match state if there are eager match states + * or a nullable unanchored end. This will link to the global NFA's + * unanchored_end_loop. */ + if (!state_set_empty(ainfo->eager_matches) || nullable_and_unanchored_end) { + if (!fsm_addstate(nfa, &ainfo->eager_match_state)) { + return false; + } + + /* Set eager match ID on new eager_match_state. */ + const fsm_output_id_t oid = (fsm_output_id_t)(id + id_base); + if (!fsm_seteageroutput(nfa, ainfo->eager_match_state, oid)) { + return false; + } + + /* For every state in eager_matches, replace every edge leading to + * the unanchored_end_loop with an edge with the same labels to + * eager_match_state. */ + struct state_iter si; + state_set_reset(ainfo->eager_matches, &si); + fsm_state_t ems_i; + while (state_set_next(&si, &ems_i)) { + if (!replace_labeled_edge(nfa, ems_i, + ainfo->unanchored_end_loop, ainfo->eager_match_state)) { + return false; + } + + /* The state must not link to the unanchored end loop anymore. + * Doing so will cause a combinatorial explosion that makes + * combining more than ~10 NFAs incredibly expensive. 
*/ + struct edge_group_iter iter; + struct edge_group_iter_info info; + assert(ems_i < nfa->statecount); + const struct fsm_state *ems = &nfa->states[ems_i]; + edge_set_group_iter_reset(ems->edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + assert(info.to != ainfo->unanchored_end_loop); + } + } + } + + /* If the group NFA matches the empty string and has an unanchored end, then + * link its unanchored start state to the eager match state. This ensures + * all inputs will match this group NFA when combined. */ + if (nullable_and_unanchored_end) { + assert(ainfo->start != NO_STATE); + assert(ainfo->eager_match_state != NO_STATE); + + if (ainfo->unanchored_start_loop != NO_STATE) { + struct fsm_state *s = &nfa->states[ainfo->unanchored_start_loop]; + if (!state_set_add(&s->epsilons, nfa->alloc, ainfo->eager_match_state)) { + return false; + } + } + } + + /* If there are anchored ends, set an endid on them */ + if (!state_set_empty(ainfo->anchored_ends)) { + struct state_iter si; + state_set_reset(ainfo->anchored_ends, &si); + fsm_state_t anchored_end_state; + const fsm_end_id_t end_id = (fsm_end_id_t)(id + id_base); + while (state_set_next(&si, &anchored_end_state)) { + if (!fsm_endid_set(nfa, anchored_end_state, end_id)) { + return false; + } + } + } + + return true; +} + +static void +rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base) +{ + if (base == 0) { return; } + +#define SHIFT(S) if (ainfo-> S != NO_STATE) { ainfo-> S += base; } + SHIFT(start); + SHIFT(unanchored_start_loop); + SHIFT(unanchored_end_loop); + SHIFT(unanchored_end_loop_end); + SHIFT(anchored_start); + SHIFT(eager_match_state); +#undef SHIFT + + state_set_rebase(&ainfo->anchored_ends, base); + state_set_rebase(&ainfo->eager_matches, base); + + edge_set_rebase(&ainfo->anchored_firsts, base); + edge_set_rebase(&ainfo->repeatable_firsts, base); + +} + +static void +free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) +{ + 
state_set_free(ainfo->anchored_ends); + state_set_free(ainfo->eager_matches); + edge_set_free(alloc, ainfo->anchored_firsts); + edge_set_free(alloc, ainfo->repeatable_firsts); +} + +/* Combine an array of FSMs into a single FSM that attempts to match them + * all in one pass, with an extra loop so that more than one pattern with + * eager outputs can match. */ struct fsm * fsm_union_repeated_pattern_group(size_t entry_count, struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases) @@ -168,9 +644,11 @@ fsm_union_repeated_pattern_group(size_t entry_count, const struct fsm_alloc *alloc = entries[0].fsm->alloc; const bool log = 0 || LOG_UNION_REPEATED_PATTERN_GROUP; - if (entry_count == 1) { - return entries[0].fsm; - } + /* TODO: make this an extra argument */ + const size_t id_base = 1; + + struct analysis_info *ainfos = f_calloc(alloc, entry_count, sizeof(ainfos[0])); + if (ainfos == NULL) { goto fail; } size_t est_total_states = 0; for (size_t i = 0; i < entry_count; i++) { @@ -183,205 +661,200 @@ fsm_union_repeated_pattern_group(size_t entry_count, est_total_states += count; } + for (size_t i = 0; i < entry_count; i++) { + struct fsm *fsm = entries[i].fsm; + + /* Identify various states in the NFA that will be relevant to combining. */ + if (!analyze_group_nfa(fsm, &ainfos[i])) { + goto fail; + } + + /* Change the NFA structure so it can better link into the combined FSM, + * and set endids and/or output IDs as appropriate. */ + if (!modify_group_nfa(fsm, i, &ainfos[i], id_base)) { + goto fail; + } + } + est_total_states += 5; /* new start and end, new unanchored start and end loops */ struct fsm *res = fsm_new_statealloc(alloc, est_total_states); if (res == NULL) { return NULL; } - /* collected end states */ - struct ends_buf { - size_t ceil; - size_t used; - fsm_state_t *states; - } ends = { .ceil = 0 }; - - /* The new overall start state, which will have an epsilon edge to... 
*/ + /* The new overall start state */ fsm_state_t global_start; if (!fsm_addstate(res, &global_start)) { goto fail; } - /* states linking to the starts of unanchored and anchored subgraphs, respectively. */ - fsm_state_t global_start_loop, global_start_anchored; - if (!fsm_addstate(res, &global_start_loop)) { goto fail; } - if (!fsm_addstate(res, &global_start_anchored)) { goto fail; } + /* States linking to the starts of unanchored and anchored subgraphs, respectively. + * Matching other group NFAs loops back to the global_unanchored_start_loop, but + * patterns anchored at the ^start are only reachable via global_anchored_start. */ + fsm_state_t global_unanchored_start_loop, global_anchored_start; + if (!fsm_addstate(res, &global_unanchored_start_loop)) { goto fail; } + if (!fsm_addstate(res, &global_anchored_start)) { goto fail; } /* The unanchored end loop state, and an end state with no outgoing edges. */ - fsm_state_t global_end_loop, global_end; + fsm_state_t global_unanchored_end_loop, global_end; if (!fsm_addstate(res, &global_end)) { goto fail; } - if (!fsm_addstate(res, &global_end_loop)) { goto fail; } + if (!fsm_addstate(res, &global_unanchored_end_loop)) { goto fail; } - /* link the start to the start loop and anchored start, and the start loop to itself */ + /* link the start to the global unanchored start loop and anchored start. 
*/ if (log) { - fprintf(stderr, "link_before: global_start %d -> global_start_loop %d and global_start_anchored %d\n", - global_start, global_start_loop, global_start_anchored); + fprintf(stderr, "link_before: global_start %d -> global_unanchored_start_loop %d and global_anchored_start %d\n", + global_start, global_unanchored_start_loop, global_anchored_start); } - if (!fsm_addedge_epsilon(res, global_start, global_start_loop)) { goto fail; } - if (!fsm_addedge_epsilon(res, global_start, global_start_anchored)) { goto fail; } - if (!fsm_addedge_any(res, global_start_loop, global_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_start, global_unanchored_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_start, global_anchored_start)) { goto fail; } - /* link the end loop and end */ + /* Link the global unanchored start loop to itself. */ + if (!fsm_addedge_any(res, global_unanchored_start_loop, global_unanchored_start_loop)) { goto fail; } + + /* Link the global unanchored end loop and global end. */ if (log) { - fprintf(stderr, "link_before: global_end_loop %d -> global_end %d (and -> self)\n", global_end_loop, global_end); + fprintf(stderr, "link_before: global_unanchored_end_loop %d -> global_end %d (and -> self)\n", global_unanchored_end_loop, global_end); } - if (!fsm_addedge_epsilon(res, global_end_loop, global_end)) { goto fail; } - if (!fsm_addedge_any(res, global_end_loop, global_end_loop)) { goto fail; } + if (!fsm_addedge_any(res, global_unanchored_end_loop, global_unanchored_end_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } if (bases != NULL) { memset(bases, 0x00, entry_count * sizeof(bases[0])); } + /* For each group FSM, link its unanchored and anchored start states + * and eager_match_state to the global ones. 
*/ for (size_t fsm_i = 0; fsm_i < entry_count; fsm_i++) { - ends.used = 0; /* reset */ - struct fsm *fsm = entries[fsm_i].fsm; entries[fsm_i].fsm = NULL; /* transfer ownership */ const size_t state_count = fsm_countstates(fsm); - - fsm_state_t fsm_start; - if (!fsm_getstart(fsm, &fsm_start)) { + struct analysis_info *ainfo = &ainfos[fsm_i]; + if (ainfo->start == NO_STATE) { fsm_free(fsm); /* no start, just discard */ continue; } + assert(ainfo->start < state_count); - for (fsm_state_t s_i = 0; s_i < state_count; s_i++) { - if (fsm_isend(fsm, s_i)) { - if (ends.used == ends.ceil) { /* grow? */ - size_t nceil = (ends.ceil == 0 ? 4 : 2*ends.ceil); - fsm_state_t *nstates = f_realloc(alloc, - ends.states, nceil * sizeof(nstates[0])); - if (nstates == NULL) { goto fail; } - ends.ceil = nceil; - ends.states = nstates; - } - ends.states[ends.used++] = s_i; - } - } - - if (ends.used == 0) { - fsm_free(fsm); /* no ends, just discard */ - continue; - } - - /* When combining these, remove self-edges from any states on the FSMs to be - * combined that also have eager output IDs. We are about to add an epsilon edge - * from each to a shared state that won't have eager output IDs. - * - * Eager output matching should be idempotent, so carrying it to other reachable - * state is redundant, and it leads to a combinatorial explosion that blows up the - * state count while determinising the combined FSM otherwise. - * - * For example, if /aaa/, /bbb/, and /ccc/ are combined into a DFA that repeats - * the sub-patterns (like `^.*(?:(aaa)|(bbb)|(ccc))+.*$`), the self-edge at each - * eager output state would combine with every reachable state from then on, - * leading to a copy of the whole reachable subgraph colored by every - * combination of eager output IDs: aaa, bbb, ccc, aaa+bbb, aaa+ccc, - * bbb+ccc, aaa+bbb+ccc. 
Instead of three relatively separate subgraphs - * that set the eager output at their last state, one for each pattern, - * it leads to 8 (2**3) subgraph clusters because it encodes _each - * distinct combination_ in the DFA. This becomes incredibly expensive - * as the combined pattern count increases; it's essentially what I'm - * trying to avoid by adding eager output support in the first place. - * - * FIXME: instead of actively removing these, filter in fsm_determinise? */ - if (fsm_eager_output_has_eager_output(fsm)) { - /* for any state that has eager outputs and a self edge, - * remove the self edge before further linkage */ - for (fsm_state_t s = 0; s < fsm->statecount; s++) { - const size_t eager_output_count = fsm_eager_output_count(fsm, s); - if (eager_output_count == 0) { continue; } - struct edge_set *edges = fsm->states[s].edges; - struct edge_set *new = edge_set_new(); - - struct edge_group_iter iter; - struct edge_group_iter_info info; - edge_set_group_iter_reset(edges, EDGE_GROUP_ITER_ALL, &iter); - while (edge_set_group_iter_next(&iter, &info)) { - if (info.to != s) { - if (!edge_set_add_bulk(&new, fsm->alloc, - info.symbols, info.to)) { - goto fail; - } - } - } - edge_set_free(fsm->alloc, edges); - fsm->states[s].edges = new; - } - } - - /* call fsm_merge; we really don't care which is which */ + /* Call fsm_merge; we really don't care which is which. */ struct fsm_combine_info combine_info; struct fsm *merged = fsm_merge(res, fsm, &combine_info); if (merged == NULL) { goto fail; } - /* update offsets if res had its state IDs shifted forward */ + /* Update offsets if res had its state IDs shifted forward. 
*/ global_start += combine_info.base_a; - global_start_loop += combine_info.base_a; - global_start_anchored += combine_info.base_a;; + global_unanchored_start_loop += combine_info.base_a; + global_anchored_start += combine_info.base_a; global_end += combine_info.base_a; - global_end_loop += combine_info.base_a; + global_unanchored_end_loop += combine_info.base_a; - /* also update offsets for the FSM's states */ - fsm_start += combine_info.base_b; - for (size_t i = 0; i < ends.used; i++) { - ends.states[i] += combine_info.base_b; - } + /* Also update offsets for the group FSM's states. */ + rebase_analysis_info(ainfo, combine_info.base_b); if (bases != NULL) { bases[fsm_i].state = combine_info.base_b; bases[fsm_i].capture = combine_info.capture_base_b; } - if (log) { - fprintf(stderr, "%s: fsm[%zd].start: %d\n", __func__, fsm_i, fsm_start); - for (size_t i = 0; i < ends.used; i++) { - fprintf(stderr, "%s: fsm[%zd].ends[%zd]: %d\n", __func__, fsm_i, i, ends.states[i]); + /* Link the FSM's eager match state back to the global_unanchored_end_loop, so that after + * matching it in an unanchored way it can continue attempting to match other combined + * patterns that aren't anchored at their start. Also link it to the global end, so + * it will be retained during determinisation and minimisation. */ + if (ainfo->eager_match_state != NO_STATE) { + if (!fsm_addedge_epsilon(merged, ainfo->eager_match_state, global_unanchored_end_loop)) { + goto fail; + } + if (!fsm_addedge_epsilon(merged, ainfo->eager_match_state, global_end)) { + goto fail; + } + + /* If the NFA matches the empty string and is not anchored at the end, then + * add an epsilon edge from the global start directly to its eager match state. + * This ensures all inputs will match this group NFA when combined. 
*/ + if (ainfo->nullable) { + assert(ainfo->unanchored_end_loop != NO_STATE); + if (!fsm_addedge_epsilon(merged, global_start, ainfo->eager_match_state)) { + goto fail; + } + } + } else { + /* If the NFA matches an end-anchored empty string, then add an epsilon edge from + * the global start to an anchored end, which has an endid. */ + if (ainfo->nullable && !state_set_empty(ainfo->anchored_ends)) { + struct state_iter si; + state_set_reset(ainfo->anchored_ends, &si); + fsm_state_t anchored_end_state; + + while (state_set_next(&si, &anchored_end_state)) { + if (!fsm_addedge_epsilon(merged, global_start, anchored_end_state)) { + goto fail; + } + /* It should only be necessary to link one, since that's enough + * for determinisation to carry the end id back to the start's + * epsilon closure. */ + break; + } } } - /* link to the FSM's start state */ - const fsm_state_t start_src = entries[fsm_i].anchored_start ? global_start_anchored : global_start_loop; - if (!fsm_addedge_epsilon(merged, start_src, fsm_start)) { goto fail; } - if (log) { - fprintf(stderr, "%s: linking %s %d to fsm[%zd]'s start %d (anchored? %d)\n", - __func__, - entries[fsm_i].anchored_start ? "global_start_anchored" : "global_start_loop", - start_src, fsm_i, fsm_start, entries[fsm_i].anchored_start); + struct edge_group_iter iter; + struct edge_group_iter_info info; + + /* Link the global_anchored_start to group FSM paths that are start-anchored + * and can only match at the start of input. */ + edge_set_group_iter_reset(ainfo->anchored_firsts, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + assert(global_anchored_start < merged->statecount); + struct fsm_state *anchored_start = &merged->states[global_anchored_start]; + if (!edge_set_add_bulk(&anchored_start->edges, merged->alloc, + info.symbols, info.to)) { + goto fail; + } } - /* link from the FSM's ends */ - const fsm_state_t end_dst = entries[fsm_i].anchored_end ? 
global_end : global_end_loop; - for (size_t i = 0; i < ends.used; i++) { - if (log) { - fprintf(stderr, "%s: linking fsm[%zd]'s end[%zd] %d (anchored? %d) to %s %d\n", - __func__, fsm_i, i, ends.states[i], entries[fsm_i].anchored_end, - entries[fsm_i].anchored_end ? "global_end" : "global_end_loop", - end_dst); + /* Link the global_unanchored_start_loop to group FSM paths that aren't + * start-anchored. */ + edge_set_group_iter_reset(ainfo->repeatable_firsts, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + struct fsm_state *unanchored_start = &merged->states[global_unanchored_start_loop]; + if (!edge_set_add_bulk(&unanchored_start->edges, merged->alloc, + info.symbols, info.to)) { + goto fail; } - if (!fsm_addedge_epsilon(merged, ends.states[i], end_dst)) { goto fail; } } res = merged; } - /* Link from the global_end_loop to the global_start_loop, so patterns with an - * unanchored start can follow other patterns with an unanchored end. */ + /* Link from the global_unanchored_end_loop to the global_unanchored_start_loop, + * so patterns with an unanchored start can follow other patterns with an unanchored + * end, possibly with other ignored input between them. 
*/ if (log) { fprintf(stderr, "%s: g_start %d, g_start_loop %d, g_start_anchored %d, g_end_loop %d, g_end %d (after all merging)\n", - __func__, global_start, global_start_loop, global_start_anchored, global_end_loop, global_end); - fprintf(stderr, "%s: linking global_end_loop %d to global_start_loop %d\n", - __func__, global_end_loop, global_start_loop); + __func__, global_start, global_unanchored_start_loop, global_anchored_start, global_unanchored_end_loop, global_end); + fprintf(stderr, "%s: linking global_unanchored_end_loop %d to global_unanchored_start_loop %d\n", + __func__, global_unanchored_end_loop, global_unanchored_start_loop); fprintf(stderr, "%s: setting global_start %d and end %d\n", __func__, global_start, global_end); } - if (!fsm_addedge_epsilon(res, global_end_loop, global_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_unanchored_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } /* This needs to be set after merging, because that clears the start state. */ fsm_setstart(res, global_start); fsm_setend(res, global_end, 1); - f_free(alloc, ends.states); + for (size_t i = 0; i < entry_count; i++) { + free_analysis(alloc, &ainfos[i]); + } + + f_free(alloc, ainfos); + return res; fail: - f_free(alloc, ends.states); + if (ainfos != NULL) { + for (size_t i = 0; i < entry_count; i++) { + free_analysis(alloc, &ainfos[i]); + } + f_free(alloc, ainfos); + } + return NULL; } diff --git a/tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c b/tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c new file mode 100644 index 000000000..06965e014 --- /dev/null +++ b/tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c @@ -0,0 +1,29 @@ +#include "utils.h" + +/* Test for false positive matches when combining patterns + * with both anchored and unanchored subtrees. 
*/ + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "a(?:x|$)", + "a(?:y|$)", + }, + .inputs = { + { .input = "a", .expected_ids = { 1, 2 } }, + { .input = "aZ", .expect_fail = true }, + { .input = "Za", .expected_ids = { 1, 2 } }, + { .input = "ax", .expected_ids = { 1 } }, + { .input = "axZ", .expected_ids = { 1 } }, + { .input = "ay", .expected_ids = { 2 } }, + { .input = "ayZ", .expected_ids = { 2 } }, + { .input = "axa", .expected_ids = { 1, 2 } }, + { .input = "aya", .expected_ids = { 1, 2 } }, + { .input = "axay", .expected_ids = { 1, 2 } }, + { .input = "ayax", .expected_ids = { 1, 2 } }, + }, + }; + + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_at_start_multiple_anchored.c b/tests/eager_output/eager_output_at_start_multiple_anchored.c new file mode 100644 index 000000000..ce242472f --- /dev/null +++ b/tests/eager_output/eager_output_at_start_multiple_anchored.c @@ -0,0 +1,15 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + [0] = "$", + [1] = "^$", + }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test, false, false); +} diff --git a/tests/eager_output/eager_output_at_start_single_anchored.c b/tests/eager_output/eager_output_at_start_single_anchored.c new file mode 100644 index 000000000..e7ce16050 --- /dev/null +++ b/tests/eager_output/eager_output_at_start_single_anchored.c @@ -0,0 +1,22 @@ +#include "utils.h" + +int main(void) +{ + const struct eager_output_test test_unanchored_start = { + .patterns = { "$" }, + .inputs = { + { .input = "", .expected_ids = { 1 } }, + }, + }; + + const struct eager_output_test test_anchored_start = { + .patterns = { "^$" }, + .inputs = { + { .input = "", .expected_ids = { 1 } }, + }, + }; + + bool pass = run_test(&test_unanchored_start, false, false); + pass = run_test(&test_anchored_start, false, false) && pass; + return pass; +} diff --git 
a/tests/eager_output/utils.c b/tests/eager_output/utils.c index 4bee8d848..f7eb23069 100644 --- a/tests/eager_output/utils.c +++ b/tests/eager_output/utils.c @@ -45,6 +45,8 @@ dump(const struct fsm *fsm) int run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids) { + (void)force_endids; /* TODO: unused, remove. */ + struct fsm_union_entry entries[MAX_PATTERNS] = {0}; allow_extra_outputs = false; @@ -82,45 +84,14 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); assert(fsm != NULL); - /* Zero is used to terminate expected_ids, so don't use it here. */ - const fsm_output_id_t output_id = (fsm_output_id_t) (i + 1); - const fsm_end_id_t end_id = (fsm_end_id_t) (i + 1); - - /* Set either an end ID or an eager output ID, depending on - * whether the fsm is anchored at the end or not. */ - if (e->anchored_end || force_endids) { - ret = fsm_setendid(fsm, end_id); - } else { - ret = fsm_seteageroutputonends(fsm, output_id); - } - assert(ret == 1); - if (log) { fprintf(stderr, "==== source DFA %zd (pre det+min)\n", i); - if (log > 1) { dump(fsm); } - fsm_eager_output_dump(stderr, fsm); - fsm_endid_dump(stderr, fsm); - fprintf(stderr, "====\n"); - } - - ret = fsm_determinise(fsm); - assert(ret == 1); - - if (log) { - fprintf(stderr, "==== source DFA %zd (post det)\n", i); - if (log > 1) { dump(fsm); } - fsm_eager_output_dump(stderr, fsm); - fprintf(stderr, "====\n"); - } - - ret = fsm_minimise(fsm); - assert(ret == 1); - - if (log) { - fprintf(stderr, "==== source DFA %zd (post det+min)\n", i); - if (log > 1) { dump(fsm); } - fsm_eager_output_dump(stderr, fsm); - fprintf(stderr, "====\n"); + if (log > 1) { + dump(fsm); + fsm_eager_output_dump(stderr, fsm); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } } e->fsm = fsm; @@ -133,11 +104,13 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo if 
(log) { fprintf(stderr, "==== combined (pre det+min)\n"); - if (log > 1) { dump(fsm); } - fsm_eager_output_dump(stderr, fsm); - fprintf(stderr, "--- endids:\n"); - fsm_endid_dump(stderr, fsm); - fprintf(stderr, "====\n"); + if (log > 1) { + dump(fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } } if (log) { @@ -151,9 +124,11 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo if (log) { fprintf(stderr, "==== combined (post det)\n"); - if (log > 1) { dump(fsm); } - fsm_eager_output_dump(stderr, fsm); - fprintf(stderr, "====\n"); + if (log > 1) { + dump(fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } } ret = fsm_minimise(fsm); @@ -164,11 +139,13 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo if (log) { fprintf(stderr, "==== combined (post det+min)\n"); - if (log > 1) { dump(fsm); } - fsm_eager_output_dump(stderr, fsm); - fprintf(stderr, "--- endids:\n"); - fsm_endid_dump(stderr, fsm); - fprintf(stderr, "====\n"); + if (log > 1) { + dump(fsm); + fsm_eager_output_dump(stderr, fsm); + fprintf(stderr, "--- endids:\n"); + fsm_endid_dump(stderr, fsm); + fprintf(stderr, "====\n"); + } } struct cb_info outputs = { 0 }; @@ -218,13 +195,22 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo /* Copy endid outputs into outputs.ids[], since for testing * purposes we don't care about the difference between eager - * output and endids here -- the values don't overlap. */ + * output and endids here. 
*/ assert(outputs.used + endid_count <= MAX_IDS); for (size_t endid_i = 0; endid_i < endid_count; endid_i++) { if (log) { fprintf(stderr, "-- adding endid %zd: %d\n", endid_i, endid_buf[endid_i]); } - outputs.ids[outputs.used++] = (fsm_output_id_t)endid_buf[endid_i]; + bool found = false; + for (size_t o_i = 0; o_i < outputs.used; o_i++) { + if (outputs.ids[o_i] == endid_buf[endid_i]) { + found = true; + break; + } + } + if (!found) { + outputs.ids[outputs.used++] = (fsm_output_id_t)endid_buf[endid_i]; + } } } From af27a87c1d078465a8e22dda00a61c0dc00b26e7 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 4 Feb 2025 13:57:35 -0500 Subject: [PATCH 38/80] fsm_union_repeated_pattern_group: Interface changes. - Instead of taking an array of `struct fsm_union_entry` structs, this now takes an array of `struct fsm *` pointers. The other fields on `fsm_union_entry` are no longer used, so the extra struct layer has been removed. - This now takes an extra argument, id_base, because each nfas[i] will get end IDs and/or output IDs (i + id_base) set on them. Previously these were set by the caller. - Rename parameters, to emphasize that the FSMs must be NFAs. - Update the test code for the interface changes. - Remove flags from the test code that are no longer used.
--- include/fsm/bool.h | 23 ++++++---- src/libfsm/union.c | 43 +++++++++---------- tests/eager_output/eager_output1.c | 2 +- tests/eager_output/eager_output2.c | 2 +- tests/eager_output/eager_output3.c | 2 +- tests/eager_output/eager_output4.c | 2 +- tests/eager_output/eager_output5.c | 2 +- tests/eager_output/eager_output6.c | 2 +- tests/eager_output/eager_output7.c | 30 +------------ ...utput_alt_mixing_anchored_and_unanchored.c | 2 +- tests/eager_output/eager_output_at_start.c | 2 +- .../eager_output_at_start_multiple.c | 15 +++++++ .../eager_output_at_start_multiple_anchored.c | 2 +- .../eager_output_at_start_single_anchored.c | 4 +- tests/eager_output/eager_output_fr1.c | 2 +- tests/eager_output/eager_output_fr2.c | 2 +- tests/eager_output/eager_output_fr3.c | 2 +- .../eager_output_mixed_anchored_unanchored.c | 2 +- tests/eager_output/utils.c | 38 ++++------------ tests/eager_output/utils.h | 2 +- 20 files changed, 76 insertions(+), 105 deletions(-) create mode 100644 tests/eager_output/eager_output_at_start_multiple.c diff --git a/include/fsm/bool.h b/include/fsm/bool.h index 4d9f1889a..c2c2d80ed 100644 --- a/include/fsm/bool.h +++ b/include/fsm/bool.h @@ -52,15 +52,22 @@ struct fsm * fsm_union_array(size_t fsm_count, struct fsm **fsms, struct fsm_combined_base_pair *bases); -struct fsm_union_entry { - struct fsm *fsm; - bool anchored_start; - bool anchored_end; -}; - +/* Combine an array of NFAs into a single NFA that attempts to match them + * all in one pass, with an extra loop so that more than one pattern with + * eager outputs can match. Ownership of the NFAs is transferred, they will + * be combined (or freed, if they don't have a start state). + * + * This MUST be called with NFAs constructed via re_comp, Calling it with + * manually constructed NFAs or DFAs is unsupported. + * + * This will set end IDs and/or output IDs representing matching each + * of the original NFAs on the combined result, where nfas[i] will + * get ID of (id_base + i). 
+ * + * Returns NULL on error. */ struct fsm * -fsm_union_repeated_pattern_group(size_t entry_count, - struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases); +fsm_union_repeated_pattern_group(size_t nfa_count, + struct fsm **nfas, struct fsm_combined_base_pair *bases, size_t id_base); struct fsm * fsm_intersect(struct fsm *a, struct fsm *b); diff --git a/src/libfsm/union.c b/src/libfsm/union.c index ed71384ff..f381d7cd6 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -638,31 +638,28 @@ free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) * all in one pass, with an extra loop so that more than one pattern with * eager outputs can match. */ struct fsm * -fsm_union_repeated_pattern_group(size_t entry_count, - struct fsm_union_entry *entries, struct fsm_combined_base_pair *bases) +fsm_union_repeated_pattern_group(size_t nfa_count, + struct fsm **nfas, struct fsm_combined_base_pair *bases, size_t id_base) { - const struct fsm_alloc *alloc = entries[0].fsm->alloc; + const struct fsm_alloc *alloc = nfas[0]->alloc; const bool log = 0 || LOG_UNION_REPEATED_PATTERN_GROUP; - /* TODO: make this an extra argument */ - const size_t id_base = 1; - - struct analysis_info *ainfos = f_calloc(alloc, entry_count, sizeof(ainfos[0])); + struct analysis_info *ainfos = f_calloc(alloc, nfa_count, sizeof(ainfos[0])); if (ainfos == NULL) { goto fail; } size_t est_total_states = 0; - for (size_t i = 0; i < entry_count; i++) { - assert(entries[i].fsm); - if (entries[i].fsm->alloc != alloc) { + for (size_t i = 0; i < nfa_count; i++) { + assert(nfas[i]); + if (nfas[i]->alloc != alloc) { errno = EINVAL; return NULL; } - const size_t count = fsm_countstates(entries[i].fsm); + const size_t count = fsm_countstates(nfas[i]); est_total_states += count; } - for (size_t i = 0; i < entry_count; i++) { - struct fsm *fsm = entries[i].fsm; + for (size_t i = 0; i < nfa_count; i++) { + struct fsm *fsm = nfas[i]; /* Identify various states in the NFA that will 
be relevant to combining. */ if (!analyze_group_nfa(fsm, &ainfos[i])) { @@ -716,17 +713,17 @@ fsm_union_repeated_pattern_group(size_t entry_count, if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } if (bases != NULL) { - memset(bases, 0x00, entry_count * sizeof(bases[0])); + memset(bases, 0x00, nfa_count * sizeof(bases[0])); } - /* For each group FSM, link its unanchored and anchored start states + /* For each group NFA, link its unanchored and anchored start states * and eager_match_state to the global ones. */ - for (size_t fsm_i = 0; fsm_i < entry_count; fsm_i++) { - struct fsm *fsm = entries[fsm_i].fsm; - entries[fsm_i].fsm = NULL; /* transfer ownership */ + for (size_t nfa_i = 0; nfa_i < nfa_count; nfa_i++) { + struct fsm *fsm = nfas[nfa_i]; + nfas[nfa_i] = NULL; /* transfer ownership */ const size_t state_count = fsm_countstates(fsm); - struct analysis_info *ainfo = &ainfos[fsm_i]; + struct analysis_info *ainfo = &ainfos[nfa_i]; if (ainfo->start == NO_STATE) { fsm_free(fsm); /* no start, just discard */ continue; @@ -749,8 +746,8 @@ fsm_union_repeated_pattern_group(size_t entry_count, rebase_analysis_info(ainfo, combine_info.base_b); if (bases != NULL) { - bases[fsm_i].state = combine_info.base_b; - bases[fsm_i].capture = combine_info.capture_base_b; + bases[nfa_i].state = combine_info.base_b; + bases[nfa_i].capture = combine_info.capture_base_b; } /* Link the FSM's eager match state back to the global_unanchored_end_loop, so that after @@ -840,7 +837,7 @@ fsm_union_repeated_pattern_group(size_t entry_count, fsm_setstart(res, global_start); fsm_setend(res, global_end, 1); - for (size_t i = 0; i < entry_count; i++) { + for (size_t i = 0; i < nfa_count; i++) { free_analysis(alloc, &ainfos[i]); } @@ -850,7 +847,7 @@ fsm_union_repeated_pattern_group(size_t entry_count, fail: if (ainfos != NULL) { - for (size_t i = 0; i < entry_count; i++) { + for (size_t i = 0; i < nfa_count; i++) { free_analysis(alloc, &ainfos[i]); } 
f_free(alloc, ainfos); diff --git a/tests/eager_output/eager_output1.c b/tests/eager_output/eager_output1.c index f20ef77b7..4900f89e0 100644 --- a/tests/eager_output/eager_output1.c +++ b/tests/eager_output/eager_output1.c @@ -8,5 +8,5 @@ int main(void) { .input = "abc", .expected_ids = { 1 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output2.c b/tests/eager_output/eager_output2.c index cdac204e2..6a10eec1c 100644 --- a/tests/eager_output/eager_output2.c +++ b/tests/eager_output/eager_output2.c @@ -13,5 +13,5 @@ int main(void) { .input = "XabeX", .expected_ids = { 1 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output3.c b/tests/eager_output/eager_output3.c index c11bc58a4..b6320ef79 100644 --- a/tests/eager_output/eager_output3.c +++ b/tests/eager_output/eager_output3.c @@ -12,5 +12,5 @@ int main(void) { .input = "abe", .expected_ids = { 1 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output4.c b/tests/eager_output/eager_output4.c index 47cd32029..2e0f17f13 100644 --- a/tests/eager_output/eager_output4.c +++ b/tests/eager_output/eager_output4.c @@ -9,5 +9,5 @@ int main(void) { .input = "Xabcde", .expected_ids = { 1 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output5.c b/tests/eager_output/eager_output5.c index 4551c68b1..6d2ce4eb8 100644 --- a/tests/eager_output/eager_output5.c +++ b/tests/eager_output/eager_output5.c @@ -10,5 +10,5 @@ int main(void) { .input = "abbc", .expected_ids = { 2 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output6.c b/tests/eager_output/eager_output6.c index 5431d0981..188541f39 100644 --- a/tests/eager_output/eager_output6.c +++ b/tests/eager_output/eager_output6.c @@ -30,5 +30,5 @@ 
int main(void) }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output7.c b/tests/eager_output/eager_output7.c index 3d123878b..94e9f1787 100644 --- a/tests/eager_output/eager_output7.c +++ b/tests/eager_output/eager_output7.c @@ -2,24 +2,6 @@ int main(void) { - /* Run this test with env FORCE_ENDIDS=N ... to see how much more - * expensive it is to combine the first N patterns using endids, - * rather than eager_outputs. It becomes VERY slow for >= 9 or so. - * (Note that the checks probably will not pass for N < 4, because - * it will start skipping appear in the early test inputs.) */ - bool force_endids = false; - size_t force_endid_count = 0; - { - const char *str = getenv("FORCE_ENDIDS"); - if (str != NULL) { - force_endid_count = atoi(str); - if (force_endid_count == 0) { - force_endid_count = 26; - } - force_endids = true; - } - } - struct eager_output_test test = { .patterns = { [0] = "apple", @@ -89,15 +71,5 @@ int main(void) }, }; - /* truncate patterns to the first N */ - if (force_endids) { - assert(force_endid_count > 0 && force_endid_count <= 26); - test.patterns[force_endid_count] = NULL; - - /* truncate test inputs to just the first couple, since - * later inputs use later patterns */ - test.inputs[5].input = NULL; - } - - return run_test(&test, false, force_endids); + return run_test(&test); } diff --git a/tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c b/tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c index 06965e014..ad125b9b1 100644 --- a/tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c +++ b/tests/eager_output/eager_output_alt_mixing_anchored_and_unanchored.c @@ -25,5 +25,5 @@ int main(void) }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output_at_start.c b/tests/eager_output/eager_output_at_start.c index 407aa4e77..8ba5f2ad1 100644 --- 
a/tests/eager_output/eager_output_at_start.c +++ b/tests/eager_output/eager_output_at_start.c @@ -8,5 +8,5 @@ int main(void) { .input = "", .expected_ids = { 1 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output_at_start_multiple.c b/tests/eager_output/eager_output_at_start_multiple.c new file mode 100644 index 000000000..ddc9530f5 --- /dev/null +++ b/tests/eager_output/eager_output_at_start_multiple.c @@ -0,0 +1,15 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + [0] = "", + [1] = "", + }, + .inputs = { + { .input = "", .expected_ids = { 1, 2 } }, + }, + }; + return run_test(&test); +} diff --git a/tests/eager_output/eager_output_at_start_multiple_anchored.c b/tests/eager_output/eager_output_at_start_multiple_anchored.c index ce242472f..0cf7e9b70 100644 --- a/tests/eager_output/eager_output_at_start_multiple_anchored.c +++ b/tests/eager_output/eager_output_at_start_multiple_anchored.c @@ -11,5 +11,5 @@ int main(void) { .input = "", .expected_ids = { 1, 2 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output_at_start_single_anchored.c b/tests/eager_output/eager_output_at_start_single_anchored.c index e7ce16050..a9c13ef67 100644 --- a/tests/eager_output/eager_output_at_start_single_anchored.c +++ b/tests/eager_output/eager_output_at_start_single_anchored.c @@ -16,7 +16,7 @@ int main(void) }, }; - bool pass = run_test(&test_unanchored_start, false, false); - pass = run_test(&test_anchored_start, false, false) && pass; + bool pass = run_test(&test_unanchored_start); + pass = run_test(&test_anchored_start) && pass; return pass; } diff --git a/tests/eager_output/eager_output_fr1.c b/tests/eager_output/eager_output_fr1.c index e8e5f3395..97eb34312 100644 --- a/tests/eager_output/eager_output_fr1.c +++ b/tests/eager_output/eager_output_fr1.c @@ -9,5 +9,5 @@ int main(void) { .input = 
"ab", .expected_ids = { 1, 2 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output_fr2.c b/tests/eager_output/eager_output_fr2.c index 404e98644..23bd3103c 100644 --- a/tests/eager_output/eager_output_fr2.c +++ b/tests/eager_output/eager_output_fr2.c @@ -9,5 +9,5 @@ int main(void) { .input = "", .expected_ids = { 1, 2 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output_fr3.c b/tests/eager_output/eager_output_fr3.c index c7e4127a6..0d15a4a68 100644 --- a/tests/eager_output/eager_output_fr3.c +++ b/tests/eager_output/eager_output_fr3.c @@ -9,5 +9,5 @@ int main(void) { .input = "", .expected_ids = { 1, 2 } }, }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/eager_output_mixed_anchored_unanchored.c b/tests/eager_output/eager_output_mixed_anchored_unanchored.c index a586f9840..7afb272db 100644 --- a/tests/eager_output/eager_output_mixed_anchored_unanchored.c +++ b/tests/eager_output/eager_output_mixed_anchored_unanchored.c @@ -42,5 +42,5 @@ int main(void) }, }; - return run_test(&test, false, false); + return run_test(&test); } diff --git a/tests/eager_output/utils.c b/tests/eager_output/utils.c index f7eb23069..dfd2b952b 100644 --- a/tests/eager_output/utils.c +++ b/tests/eager_output/utils.c @@ -43,15 +43,11 @@ dump(const struct fsm *fsm) } int -run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids) +run_test(const struct eager_output_test *test) { - (void)force_endids; /* TODO: unused, remove. 
*/ + struct fsm *nfas[MAX_PATTERNS] = {0}; - struct fsm_union_entry entries[MAX_PATTERNS] = {0}; - - allow_extra_outputs = false; - - size_t fsms_used = 0; + size_t nfas_used = 0; int ret = 0; int log = 0; @@ -68,24 +64,12 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo for (size_t i = 0; i < MAX_PATTERNS; i++) { const char *p = test->patterns[i]; if (test->patterns[i] == NULL) { break; } - const size_t len = strlen(p); - struct fsm_union_entry *e = &entries[fsms_used]; - - /* For sake of these patterns, they are anchored if the first/last - * character is '^' and '$', respectively. This is too simplistic - * for the general case, though. */ - if (len > 0) { - if (p[0] == '^') { e->anchored_start = true; } - if (p[len - 1] == '$') { e->anchored_end = true; } - /* fprintf(stderr, "%s: p[%zd]: '%s', start %d, end %d\n", */ - /* __func__, fsms_used, p, e->anchored_start, e->anchored_end); */ - } struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); assert(fsm != NULL); if (log) { - fprintf(stderr, "==== source DFA %zd (pre det+min)\n", i); + fprintf(stderr, "==== source NFA %zd\n", i); if (log > 1) { dump(fsm); fsm_eager_output_dump(stderr, fsm); @@ -94,12 +78,12 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo } } - e->fsm = fsm; - fsms_used++; + nfas[i] = fsm; + nfas_used++; } - /* If there's only one pattern this just returns fsms[0]. 
*/ - struct fsm *fsm = fsm_union_repeated_pattern_group(fsms_used, entries, NULL); + const size_t id_base = 1; /* offset by 1 because 0 is used as end-of-list */ + struct fsm *fsm = fsm_union_repeated_pattern_group(nfas_used, nfas, NULL, id_base); assert(fsm != NULL); if (log) { @@ -238,11 +222,7 @@ run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool fo assert(ret == 1); } - if (!allow_extra_outputs) { - assert(outputs.used == expected_id_count); - } else { - assert(outputs.used >= expected_id_count); - } + assert(outputs.used >= expected_id_count); size_t floor = 0; for (size_t exp_i = 0; exp_i < outputs.used; exp_i++) { diff --git a/tests/eager_output/utils.h b/tests/eager_output/utils.h index 672c01977..02f8427c9 100644 --- a/tests/eager_output/utils.h +++ b/tests/eager_output/utils.h @@ -48,7 +48,7 @@ int cmp_output(const void *pa, const void *pb); int -run_test(const struct eager_output_test *test, bool allow_extra_outputs, bool force_endids); +run_test(const struct eager_output_test *test); struct cb_info { size_t used; From d44a6710caed008004d8ea752211ef366fa7917b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 5 Feb 2025 15:35:44 -0500 Subject: [PATCH 39/80] fsm_union_repeated_pattern_group: fix linkage for mixed start anchoring. 
--- src/libfsm/union.c | 64 ++++++++++++++++--- ...tput_mixed_anchored_and_unanchored_start.c | 18 ++++++ 2 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 tests/eager_output/eager_output_mixed_anchored_and_unanchored_start.c diff --git a/src/libfsm/union.c b/src/libfsm/union.c index f381d7cd6..e3c7311ae 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -357,6 +357,56 @@ start_state_epsilon_closure_matches_empty_string(const struct fsm *nfa, fsm_stat return true; } +static bool +copy_noncycle_labeled_edges_from_epsilon_closure_iter(struct analysis_info *ainfo, + const struct fsm *nfa, fsm_state_t usl_i, fsm_state_t s_i, struct state_set **seen) +{ + const struct fsm_state *s = &nfa->states[s_i]; + + struct edge_group_iter egi; + struct edge_group_iter_info info; + + /* copy its labeled edges to ainfo->repeatable_firsts, + * unless they lead back to the unanchored_start_loop */ + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + if (info.to != usl_i) { + if (!edge_set_add_bulk(&ainfo->repeatable_firsts, + nfa->alloc, info.symbols, info.to)) { + return false; + } + } + } + + if (!state_set_add(seen, nfa->alloc, usl_i)) { return false; } + + struct state_iter si; + state_set_reset(s->epsilons, &si); + fsm_state_t ns_i; + while (state_set_next(&si, &ns_i)) { + if (state_set_contains(*seen, ns_i)) { continue; } + if (!copy_noncycle_labeled_edges_from_epsilon_closure_iter(ainfo, nfa, usl_i, ns_i, seen)) { + return false; + } + } + + return true; +} + +static bool +copy_noncycle_labeled_edges_from_epsilon_closure(struct analysis_info *ainfo, + const struct fsm *nfa, fsm_state_t unanchored_start_loop_id) +{ + struct state_set *seen = NULL; /* empty set */ + if (!copy_noncycle_labeled_edges_from_epsilon_closure_iter(ainfo, nfa, + unanchored_start_loop_id, unanchored_start_loop_id, &seen)) { + return false; + } + state_set_free(seen); + + return true; +} + static bool 
analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) { @@ -396,15 +446,11 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) assert(ainfo->unanchored_start_loop == NO_STATE); ainfo->unanchored_start_loop = ns_i; - /* copy its non-self labeled edges to ainfo->repeatable_firsts */ - edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &egi); - while (edge_set_group_iter_next(&egi, &info)) { - if (info.to != ns_i) { - if (!edge_set_add_bulk(&ainfo->repeatable_firsts, - nfa->alloc, info.symbols, info.to)) { - goto alloc_fail; - } - } + /* Copy labeled edges from the unanchored start loop and its epsilon + * closure to ainfo->repeatable_firsts, except for edges leading back + * to the unanchored start loop. */ + if (!copy_noncycle_labeled_edges_from_epsilon_closure(ainfo, nfa, ns_i)) { + goto alloc_fail; } } else { /* likewise, a state without a dot self-edge is the anchored start */ diff --git a/tests/eager_output/eager_output_mixed_anchored_and_unanchored_start.c b/tests/eager_output/eager_output_mixed_anchored_and_unanchored_start.c new file mode 100644 index 000000000..d18a67de7 --- /dev/null +++ b/tests/eager_output/eager_output_mixed_anchored_and_unanchored_start.c @@ -0,0 +1,18 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "(^|[^A-Z])abc", + }, + .inputs = { + { .input = "abc", .expected_ids = { 1 } }, + { .input = "xabc", .expected_ids = { 1 } }, + { .input = "xyz abc", .expected_ids = { 1 } }, + { .input = "Xabc", .expect_fail = true }, + }, + }; + + return run_test(&test); +} From 36129d079d3e52546fcb7e2020118d0c893ec09c Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 12 Feb 2025 11:38:10 -0500 Subject: [PATCH 40/80] Add tests, fix anchoring bugs in fsm_union_repeated_pattern_group. 
There are two bugs captured in eager_output_unanchored_end_plus.c: - Regexes ending in '+' weren't combining correctly, because analysis wasn't properly handling the construction for matching but optionally repeating the last character. - Eager matching after consuming a single character from the start state wasn't linked correctly to the global_unanchored_start_loop, so while the labeled edges were copied the eager output was lost. The other test files are focused on variants of that -- the + and start cases individually, and when + precedes a `()` subtree with more than one character. --- src/libfsm/union.c | 474 ++++++++++++------ .../eager_output_unanchored_end_plus.c | 19 + .../eager_output_unanchored_end_plus_min.c | 18 + .../eager_output_unanchored_end_plus_min2.c | 19 + ...ger_output_unanchored_end_plus_multichar.c | 19 + 5 files changed, 394 insertions(+), 155 deletions(-) create mode 100644 tests/eager_output/eager_output_unanchored_end_plus.c create mode 100644 tests/eager_output/eager_output_unanchored_end_plus_min.c create mode 100644 tests/eager_output/eager_output_unanchored_end_plus_min2.c create mode 100644 tests/eager_output/eager_output_unanchored_end_plus_multichar.c diff --git a/src/libfsm/union.c b/src/libfsm/union.c index e3c7311ae..9acdddf01 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -27,8 +27,11 @@ #include "endids.h" #define LOG_UNION_ARRAY 0 -#define LOG_ANALYZE_GROUP_NFA_RESULTS 0 +#define LOG_ANALYZE_GROUP_NFA 0 +#define LOG_AFTER_MODIFY_GROUP_NFA 0 +#define LOG_ANALYZE_GROUP_NFA_RESULTS (0 || LOG_ANALYZE_GROUP_NFA > 1) #define LOG_UNION_REPEATED_PATTERN_GROUP 0 +#define LOG_FSM_UNION_REPEATED_PATTERN_GROUP_OUTPUT 0 #define NO_STATE (fsm_state_t)-1 @@ -68,6 +71,9 @@ struct analysis_info { * epsilon edge to the combined NFA's global_unanchored_end_loop. * The old unanchored_end_loop will be disconnected. */ fsm_state_t eager_match_state; + + /* These states need an epsilon edge added to the eager_matched_state. 
*/ + struct state_set *needs_indirect_epsilon_edge_to_eager_match_state; }; struct fsm * @@ -253,37 +259,48 @@ dump_edge_set(FILE *f, const char *name, fsm_state_t from, const struct edge_set } #endif -/* If there's a labeled edge to an end state, check if the label set is - * only [\n] and there's also an epsilon edge to the same end state. - * This represents an anchored end in the NFA. */ +/* For each state in the epsilon closure, if there's a labeled edge + * to an end state, check if the label set is only [\n] and there's + * also an epsilon edge to the same end state. + * If so, this represents an anchored end in the NFA. */ static bool -has_epsilon_and_newline_edges_to_end(const struct fsm *nfa, fsm_state_t s_i, fsm_state_t *dst_end) +has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa, struct state_set *eclosure, + fsm_state_t s_i, fsm_state_t *dst_end) { - assert(s_i < nfa->statecount); - const struct fsm_state *s = &nfa->states[s_i]; + struct state_iter si; + state_set_reset(eclosure, &si); + fsm_state_t ns_i; + while (state_set_next(&si, &ns_i)) { + assert(ns_i < nfa->statecount); + const struct fsm_state *ns = &nfa->states[ns_i]; - if (state_set_empty(s->epsilons)) { return false; } - if (edge_set_empty(s->edges)) { return false; } + if (state_set_empty(ns->epsilons)) { continue; } + if (edge_set_empty(ns->edges)) { continue; } - struct edge_group_iter iter; - struct edge_group_iter_info info; + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + /* Look for an edge set with only '\n' */ + if ((info.symbols[0] != (1ULL << '\n')) + || info.symbols[1] || info.symbols[2] || info.symbols[3]) { + continue; + } - edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &iter); - while (edge_set_group_iter_next(&iter, &info)) { - /* Look for an edge set with only '\n' */ - if ((info.symbols[0] != (1ULL << '\n')) - || 
info.symbols[1] || info.symbols[2] || info.symbols[3]) { - continue; - } + /* If it's an end, look for an epsilon leeding to the same destination */ + if (fsm_isend(nfa, info.to)) { + struct state_iter inner_si; + fsm_state_t os_i; - if (fsm_isend(nfa, info.to)) { - struct state_iter si; - fsm_state_t os_i; - state_set_reset(s->epsilons, &si); - while (state_set_next(&si, &os_i)) { - if (os_i == info.to) { - *dst_end = info.to; - return true; + assert(s_i < nfa->statecount); + const struct fsm_state *s = &nfa->states[s_i]; + + state_set_reset(s->epsilons, &inner_si); + while (state_set_next(&inner_si, &os_i)) { + if (os_i == info.to) { + *dst_end = info.to; + return true; + } } } } @@ -293,123 +310,89 @@ has_epsilon_and_newline_edges_to_end(const struct fsm *nfa, fsm_state_t s_i, fsm } static bool -has_labeled_edge_to_unanchored_end_loop(const struct fsm *nfa, - fsm_state_t s_i, fsm_state_t unanchored_end_loop) +has_labeled_edge_to_eclosure_with_unanchored_end_loop(const struct fsm *nfa, + struct state_set **eclosures, + fsm_state_t s_i, fsm_state_t unanchored_end_loop, + fsm_state_t *indirect_dst) { if (unanchored_end_loop == NO_STATE) { return false; } assert(unanchored_end_loop < nfa->statecount); - /* The unanchored_end_loop's self-edge doesn't count here. 
*/ - if (s_i == unanchored_end_loop) { return false; } - assert(s_i < nfa->statecount); - const struct fsm_state *s = &nfa->states[s_i]; - struct edge_group_iter iter; - struct edge_group_iter_info info; - edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &iter); - while (edge_set_group_iter_next(&iter, &info)) { - if (info.to == unanchored_end_loop) { - return true; - } - } - return false; -} - -static bool -start_state_epsilon_closure_matches_empty_string__iter(const struct fsm *nfa, - fsm_state_t s_i, struct state_set **seen, bool *result) -{ - if (*result) { return true; } - if (!state_set_add(seen, nfa->alloc, s_i)) { return false; } - - if (fsm_isend(nfa, s_i)) { - *result = true; - return true; - } - - assert(s_i < nfa->statecount); - const struct fsm_state *s = &nfa->states[s_i]; + const struct state_set *s_eclosure = eclosures[s_i]; + /* For every state in s_i's epsilon closure, check if it has + * a labeled edge to a state with the unanchored_end_loop + * in its epsilon closure. */ struct state_iter si; - state_set_reset(s->epsilons, &si); + state_set_reset(s_eclosure, &si); fsm_state_t ns_i; while (state_set_next(&si, &ns_i)) { - if (!state_set_contains(*seen, ns_i)) { - if (!start_state_epsilon_closure_matches_empty_string__iter(nfa, ns_i, seen, result)) { - return false; - } - if (*result) { break; } - } - } + /* The unanchored_end_loop's self-edge doesn't count here. */ + if (ns_i == unanchored_end_loop) { continue; } - return true; -} - -/* Does the start state's epsilon closure match the empty string? - * Returns false on error, otherwise returns true and sets *result. 
*/ -static bool -start_state_epsilon_closure_matches_empty_string(const struct fsm *nfa, fsm_state_t start, bool *result) -{ - struct state_set *seen = NULL; /* empty set */ - if (!start_state_epsilon_closure_matches_empty_string__iter(nfa, start, &seen, result)) { return false; } - state_set_free(seen); - - return true; -} - -static bool -copy_noncycle_labeled_edges_from_epsilon_closure_iter(struct analysis_info *ainfo, - const struct fsm *nfa, fsm_state_t usl_i, fsm_state_t s_i, struct state_set **seen) -{ - const struct fsm_state *s = &nfa->states[s_i]; + /* FIXME: this should only apply to the original state, not its epsilon closure...right? */ + if (ns_i != s_i) { continue; } - struct edge_group_iter egi; - struct edge_group_iter_info info; + assert(ns_i < nfa->statecount); + const struct fsm_state *ns = &nfa->states[ns_i]; + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + assert(info.to < nfa->statecount); + const struct state_set *to_eclosure = eclosures[info.to]; + + struct state_iter dst_si; + state_set_reset(to_eclosure, &dst_si); + fsm_state_t dst_s_i; + while (state_set_next(&dst_si, &dst_s_i)) { + if (dst_s_i == unanchored_end_loop) { + if (info.to != unanchored_end_loop) { + *indirect_dst = info.to; + } - /* copy its labeled edges to ainfo->repeatable_firsts, - * unless they lead back to the unanchored_start_loop */ - edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &egi); - while (edge_set_group_iter_next(&egi, &info)) { - if (info.to != usl_i) { - if (!edge_set_add_bulk(&ainfo->repeatable_firsts, - nfa->alloc, info.symbols, info.to)) { - return false; + return true; + } } } } - if (!state_set_add(seen, nfa->alloc, usl_i)) { return false; } - - struct state_iter si; - state_set_reset(s->epsilons, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - if (state_set_contains(*seen, ns_i)) { 
continue; } - if (!copy_noncycle_labeled_edges_from_epsilon_closure_iter(ainfo, nfa, usl_i, ns_i, seen)) { - return false; - } - } - - return true; + return false; } static bool -copy_noncycle_labeled_edges_from_epsilon_closure(struct analysis_info *ainfo, - const struct fsm *nfa, fsm_state_t unanchored_start_loop_id) +start_state_epsilon_closure_matches_empty_string(const struct fsm *nfa, const struct state_set *eclosure) { - struct state_set *seen = NULL; /* empty set */ - if (!copy_noncycle_labeled_edges_from_epsilon_closure_iter(ainfo, nfa, - unanchored_start_loop_id, unanchored_start_loop_id, &seen)) { - return false; + struct state_iter si; + state_set_reset(eclosure, &si); + + fsm_state_t s_i; + while (state_set_next(&si, &s_i)) { + if (fsm_isend(nfa, s_i)) { return true; } } - state_set_free(seen); - return true; + return false; } +static const struct fsm_options dump_nfa_opt = { + .io = FSM_IO_STR, + .ambig = AMBIG_MULTIPLE, + .case_ranges = 1, + .consolidate_edges = 1, + .group_edges = 1, +}; + static bool analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) { + if (LOG_ANALYZE_GROUP_NFA) { + fprintf(stderr, "==== %s\n", __func__); + if (LOG_ANALYZE_GROUP_NFA > 1) { + fsm_print(stderr, nfa, &dump_nfa_opt, NULL, FSM_PRINT_DOT); + } + } + memset(ainfo, 0x00, sizeof(*ainfo)); ainfo->start = NO_STATE; ainfo->unanchored_start_loop = NO_STATE; @@ -425,42 +408,67 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) const size_t state_count = fsm_countstates(nfa); assert(ainfo->start < state_count); - { - const struct fsm_state *s = &nfa->states[ainfo->start]; + struct state_set **eclosures = epsilon_closure(nfa); + if (eclosures == NULL) { + return false; + } + /* First pass: Iterate over the start state's epsilon edges, + * attempting to identify the unanchored start loop and anchored + * start states (if present). 
+ * + * Note: This uses the start state's epsilon set rather than its + * epsilon closure because (by construction) the unanchored + * start loop and anchored start states will both be directly + * connected to the start state. Using the epsilon closure can + * mis-identify the unanchored *end* loop as the start loop, if + * there is a path with only epsilon edges between them. */ + struct state_iter si; + state_set_reset(nfa->states[ainfo->start].epsilons, &si); + fsm_state_t ns_i; + while (state_set_next(&si, &ns_i)) { + if (ns_i == ainfo->start) { continue; } + + /* If there's a state in the start state's epsilon set that + * has a dot self-edge, it's the unanchored start loop. */ + if (has_dot_self_edge(nfa, ns_i)) { + if (LOG_ANALYZE_GROUP_NFA) { + fprintf(stderr, "%s: unanchored_start_loop found on state %d\n", __func__, ns_i); + } + /* there can be only one */ + assert(ainfo->unanchored_start_loop == NO_STATE + || ainfo->unanchored_start_loop == ns_i); + ainfo->unanchored_start_loop = ns_i; + continue; + } else { + /* Otherwise, a state without a dot self-edge is the anchored start. */ + if (LOG_ANALYZE_GROUP_NFA) { + fprintf(stderr, "%s: anchored_start found on state %d\n", __func__, ns_i); + } + assert(ainfo->anchored_start == NO_STATE || ainfo->anchored_start == ns_i); + ainfo->anchored_start = ns_i; + continue; + } + } + + /* Copy labeled edges from the unanchored start loop and + * its epsilon closure to ainfo->repeatable_firsts, except + * for edges leading back to the unanchored start loop. 
*/ + if (ainfo->unanchored_start_loop != NO_STATE) { struct state_iter si; - state_set_reset(s->epsilons, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - if (ns_i == ainfo->start) { continue; } + state_set_reset(eclosures[ainfo->unanchored_start_loop], &si); + fsm_state_t cs_i; + while (state_set_next(&si, &cs_i)) { + assert(cs_i < nfa->statecount); + const struct fsm_state *cs = &nfa->states[cs_i]; /* closure state */ struct edge_group_iter egi; struct edge_group_iter_info info; - assert(ns_i < state_count); - const struct fsm_state *ns = &nfa->states[ns_i]; - - /* if there's a state in the start state's epsilon closure that - * has a dot self-edge, it's the unanchored start loop */ - if (has_dot_self_edge(nfa, ns_i)) { - assert(ainfo->unanchored_start_loop == NO_STATE); - ainfo->unanchored_start_loop = ns_i; - - /* Copy labeled edges from the unanchored start loop and its epsilon - * closure to ainfo->repeatable_firsts, except for edges leading back - * to the unanchored start loop. */ - if (!copy_noncycle_labeled_edges_from_epsilon_closure(ainfo, nfa, ns_i)) { - goto alloc_fail; - } - } else { - /* likewise, a state without a dot self-edge is the anchored start */ - assert(ainfo->anchored_start == NO_STATE); - ainfo->anchored_start = ns_i; - - /* copy its labeled edges to ainfo->anchored_firsts */ - edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &egi); - while (edge_set_group_iter_next(&egi, &info)) { - if (!edge_set_add_bulk(&ainfo->anchored_firsts, + edge_set_group_iter_reset(cs->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + if (info.to != ainfo->unanchored_start_loop) { + if (!edge_set_add_bulk(&ainfo->repeatable_firsts, nfa->alloc, info.symbols, info.to)) { goto alloc_fail; } @@ -469,12 +477,33 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) } } + /* Copy labeled edges from the anchored start and its epsilon + * closure to ainfo->anchored_firsts. 
*/ + if (ainfo->anchored_start != NO_STATE) { + struct state_iter si; + state_set_reset(eclosures[ainfo->anchored_start], &si); + fsm_state_t cs_i; + while (state_set_next(&si, &cs_i)) { + assert(cs_i < nfa->statecount); + const struct fsm_state *cs = &nfa->states[cs_i]; + + struct edge_group_iter egi; + struct edge_group_iter_info info; + + edge_set_group_iter_reset(cs->edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + if (!edge_set_add_bulk(&ainfo->anchored_firsts, + nfa->alloc, info.symbols, info.to)) { + goto alloc_fail; + } + } + } + } + /* If the start state always matches, set a flag noting that it will need special handling * later. It's arguably pointless to combine "" with other regexes, because it will always * trivially match, but otherwise it would never match. */ - if (!start_state_epsilon_closure_matches_empty_string(nfa, ainfo->start, &ainfo->nullable)) { - goto alloc_fail; - } + ainfo->nullable = start_state_epsilon_closure_matches_empty_string(nfa, eclosures[ainfo->start]); /* If there's a state with a dot self-edge and an epsilon edge to an end state, it's * the unanchored end loop. There should only be one. */ @@ -499,17 +528,24 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) /* Collect states that lead to an anchored end or eager match. 
*/ for (size_t s_i = 0; s_i < state_count; s_i++) { fsm_state_t dst_end = NO_STATE; - if (has_epsilon_and_newline_edges_to_end(nfa, s_i, &dst_end)) { + if (has_epsilon_and_newline_edges_to_same_end(nfa, eclosures[s_i], s_i, &dst_end)) { if (!state_set_add(&ainfo->anchored_ends, nfa->alloc, dst_end)) { goto alloc_fail; } } - if (has_labeled_edge_to_unanchored_end_loop(nfa, s_i, ainfo->unanchored_end_loop)) { + fsm_state_t indirect_dst = NO_STATE; + if (has_labeled_edge_to_eclosure_with_unanchored_end_loop(nfa, eclosures, s_i, ainfo->unanchored_end_loop, &indirect_dst)) { if (!state_set_add(&ainfo->eager_matches, nfa->alloc, s_i)) { goto alloc_fail; } } + + if (indirect_dst != NO_STATE) { + if (!state_set_add(&ainfo->needs_indirect_epsilon_edge_to_eager_match_state, nfa->alloc, indirect_dst)) { + goto alloc_fail; + } + } } #if LOG_ANALYZE_GROUP_NFA_RESULTS @@ -522,10 +558,17 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) dump_edge_set(stderr, "repeatable_firsts", ainfo->unanchored_start_loop, ainfo->repeatable_firsts); } #endif + + closure_free(nfa, eclosures, state_count); + return true; alloc_fail: fprintf(stderr, "alloc fail\n"); + if (eclosures != NULL) { + closure_free(nfa, eclosures, state_count); + } + return false; } @@ -534,7 +577,8 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) * the libfsm API for this (and it shouldn't be necessary in general), but * if it gets one later this can be replaced. 
*/ static bool -replace_labeled_edge(struct fsm *nfa, fsm_state_t from_state, fsm_state_t old_to, fsm_state_t new_to) +replace_labeled_edge(struct fsm *nfa, fsm_state_t from_state, fsm_state_t old_to, fsm_state_t new_to, + bool *found) { if (old_to == NO_STATE) { /* nothing to do */ @@ -552,6 +596,7 @@ replace_labeled_edge(struct fsm *nfa, fsm_state_t from_state, fsm_state_t old_to struct edge_group_iter_info info; edge_set_group_iter_reset(old_edges, EDGE_GROUP_ITER_ALL, &iter); while (edge_set_group_iter_next(&iter, &info)) { + if (info.to == old_to) { *found = true; } if (!edge_set_add_bulk(&new_edges, nfa->alloc, info.symbols, info.to == old_to ? new_to : info.to)) { return false; @@ -577,6 +622,7 @@ modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t { const bool nullable_and_unanchored_end = ainfo->nullable && ainfo->unanchored_end_loop != NO_STATE; + const bool log = 0 || (LOG_ANALYZE_GROUP_NFA > 0); /* Add the eager match state if there are eager match states * or a nullable unanchored end. This will link to the global NFA's @@ -585,25 +631,47 @@ modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t if (!fsm_addstate(nfa, &ainfo->eager_match_state)) { return false; } + if (log) { + fprintf(stderr, "%s: added eager_match_state %d\n", __func__, ainfo->eager_match_state); + } /* Set eager match ID on new eager_match_state. */ const fsm_output_id_t oid = (fsm_output_id_t)(id + id_base); if (!fsm_seteageroutput(nfa, ainfo->eager_match_state, oid)) { return false; } + if (log) { + fprintf(stderr, "%s: set eager_output id %d on eager_match_state %d\n", + __func__, oid, ainfo->eager_match_state); + } /* For every state in eager_matches, replace every edge leading to * the unanchored_end_loop with an edge with the same labels to - * eager_match_state. */ + * eager_match_state. 
+ * + * If the labeled edge does not directly lead to the unanchored_end_loop, + * then add an epsilon edge from wherever it leads to eager_match_state + * instead. */ struct state_iter si; state_set_reset(ainfo->eager_matches, &si); fsm_state_t ems_i; while (state_set_next(&si, &ems_i)) { + bool found = false; if (!replace_labeled_edge(nfa, ems_i, - ainfo->unanchored_end_loop, ainfo->eager_match_state)) { + ainfo->unanchored_end_loop, ainfo->eager_match_state, &found)) { return false; } + if (log) { + if (found) { + fprintf(stderr, "%s: replacing labeled edges from eager_match_state %d to unanchored_end_loop %d with edge to new eager_match_state %d\n", + __func__, ems_i, ainfo->unanchored_end_loop, ainfo->eager_match_state); + } else if (!found && ainfo->unanchored_end_loop != NO_STATE) { + fprintf(stderr, "%s: not found: labeled edges from eager_match_state %d to unanchored_end_loop %d\n", + __func__, ems_i, ainfo->unanchored_end_loop); + } + } + /* The state must not link to the unanchored end loop anymore. * Doing so will cause a combinatorial explosion that makes * combining more ~10 NFAs incredibly expensive. 
*/ @@ -616,6 +684,16 @@ modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t assert(info.to != ainfo->unanchored_end_loop); } } + + state_set_reset(ainfo->needs_indirect_epsilon_edge_to_eager_match_state, &si); + fsm_state_t intermediate_i; + while (state_set_next(&si, &intermediate_i)) { + if (!fsm_addedge_epsilon(nfa, intermediate_i, ainfo->eager_match_state)) { return false; } + if (log) { + fprintf(stderr, "%s: adding epsilon edge from intermediate eager match state %d to new eager_match_state %d\n", + __func__, intermediate_i, ainfo->eager_match_state); + } + } } /* If the group NFA matches the empty string and has an unanchored end, then @@ -630,6 +708,10 @@ modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t if (!state_set_add(&s->epsilons, nfa->alloc, ainfo->eager_match_state)) { return false; } + if (log) { + fprintf(stderr, "%s: adding epsilon edge from unanchored_start_loop %d to eager_match_state %d\n", + __func__, ainfo->unanchored_start_loop, ainfo->eager_match_state); + } } } @@ -643,9 +725,21 @@ modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t if (!fsm_endid_set(nfa, anchored_end_state, end_id)) { return false; } + if (log) { + fprintf(stderr, "%s: setting endid %d on anchored_end_state %d\n", + __func__, end_id, anchored_end_state); + } } } +#if LOG_AFTER_MODIFY_GROUP_NFA + fprintf(stderr, "=== after %s\n", __func__); + fsm_print(stderr, nfa, &dump_nfa_opt, NULL, FSM_PRINT_DOT); + fsm_endid_dump(stderr, nfa); + fsm_eager_output_dump(stderr, nfa); + +#endif + return true; } @@ -740,6 +834,8 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addstate(res, &global_end)) { goto fail; } if (!fsm_addstate(res, &global_unanchored_end_loop)) { goto fail; } + /* do this later, combining NFAs may rebase the state IDs */ +#if 0 /* link the start to the global unanchored start loop and anchored start. 
*/ if (log) { fprintf(stderr, "link_before: global_start %d -> global_unanchored_start_loop %d and global_anchored_start %d\n", @@ -757,6 +853,7 @@ fsm_union_repeated_pattern_group(size_t nfa_count, } if (!fsm_addedge_any(res, global_unanchored_end_loop, global_unanchored_end_loop)) { goto fail; } if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } +#endif if (bases != NULL) { memset(bases, 0x00, nfa_count * sizeof(bases[0])); @@ -781,6 +878,10 @@ fsm_union_repeated_pattern_group(size_t nfa_count, struct fsm *merged = fsm_merge(res, fsm, &combine_info); if (merged == NULL) { goto fail; } + if (log) { + fprintf(stderr, "merged: bases a %d and b %d\n", combine_info.base_a, combine_info.base_b); + } + /* Update offsets if res had its state IDs shifted forward. */ global_start += combine_info.base_a; global_unanchored_start_loop += combine_info.base_a; @@ -807,6 +908,11 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addedge_epsilon(merged, ainfo->eager_match_state, global_end)) { goto fail; } + if (log) { + fprintf(stderr, "eager_match_state: adding epsilon EMS %d -> global_unanchored_end_loop %d and EMS %d -> global_end %d\n", + ainfo->eager_match_state, global_unanchored_end_loop, + ainfo->eager_match_state, global_end); + } /* If the NFA matches the empty string and is not anchored at the end, then * add an epsilon edge from the global start directly to its eager match state. 
@@ -816,6 +922,10 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addedge_epsilon(merged, global_start, ainfo->eager_match_state)) { goto fail; } + if (log) { + fprintf(stderr, "nullable: global_start %d -> eager_match_state %d\n", + global_start, ainfo->eager_match_state); + } } } else { /* If the NFA matches an end-anchored empty string, then add an epsilon edge from @@ -829,6 +939,11 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addedge_epsilon(merged, global_start, anchored_end_state)) { goto fail; } + if (log) { + fprintf(stderr, "nullable & anchored ends: global_start %d -> anchored_end_state %d\n", + global_start, anchored_end_state); + } + /* It should only be necessary to link one, since that's enough * for determinisation to carry the end id back to the start's * epsilon closure. */ @@ -850,6 +965,10 @@ fsm_union_repeated_pattern_group(size_t nfa_count, info.symbols, info.to)) { goto fail; } + if (log) { + fprintf(stderr, "anchored_firsts: adding global_anchored_start %d -> info.to %d (with same labels)\n", + global_anchored_start, info.to); + } } /* Link the global_unanchored_start_loop to group FSM paths that aren't @@ -861,15 +980,53 @@ fsm_union_repeated_pattern_group(size_t nfa_count, info.symbols, info.to)) { goto fail; } + + if (log) { + fprintf(stderr, "repeatable_firsts: adding global_unanchored_start_loop %d -> info.to %d (same edges)\n", + global_unanchored_start_loop, info.to); + } + } + + + /* Add an epsilon edge from the global unanchored start loop to the NFA's. + * Without this, eager outputs for eager matches in the start state's epsilon + * closure may get lost during determinisation. 
*/ + if (ainfo->unanchored_start_loop != NO_STATE) { + if (!fsm_addedge_epsilon(merged, global_unanchored_start_loop, ainfo->unanchored_start_loop)) { + goto fail; + } + + if (log) { + fprintf(stderr, "repeatable_firsts: adding an epsilon edge from global_unanchored_start_loop %d to NFA unanchored_start_loop %d\n", + global_unanchored_start_loop, ainfo->unanchored_start_loop); + } } res = merged; } + /* link the start to the global unanchored start loop and anchored start. */ + if (log) { + fprintf(stderr, "linking: global_start %d -> global_unanchored_start_loop %d and global_anchored_start %d\n", + global_start, global_unanchored_start_loop, global_anchored_start); + } + if (!fsm_addedge_epsilon(res, global_start, global_unanchored_start_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_start, global_anchored_start)) { goto fail; } + + /* Link the global unanchored start loop to itself. */ + if (!fsm_addedge_any(res, global_unanchored_start_loop, global_unanchored_start_loop)) { goto fail; } + + /* Link the global unanchored end loop and global end. */ + if (log) { + fprintf(stderr, "linking: global_unanchored_end_loop %d -> global_end %d (and -> self)\n", global_unanchored_end_loop, global_end); + } + if (!fsm_addedge_any(res, global_unanchored_end_loop, global_unanchored_end_loop)) { goto fail; } + if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } + /* Link from the global_unanchored_end_loop to the global_unanchored_start_loop, * so patterns with an unanchored start can follow other patterns with an unanchored * end, possibly with other ignored input between them. 
*/ - if (log) { + if (log || LOG_FSM_UNION_REPEATED_PATTERN_GROUP_OUTPUT) { fprintf(stderr, "%s: g_start %d, g_start_loop %d, g_start_anchored %d, g_end_loop %d, g_end %d (after all merging)\n", __func__, global_start, global_unanchored_start_loop, global_anchored_start, global_unanchored_end_loop, global_end); fprintf(stderr, "%s: linking global_unanchored_end_loop %d to global_unanchored_start_loop %d\n", @@ -887,6 +1044,13 @@ fsm_union_repeated_pattern_group(size_t nfa_count, free_analysis(alloc, &ainfos[i]); } + if (LOG_UNION_REPEATED_PATTERN_GROUP || LOG_FSM_UNION_REPEATED_PATTERN_GROUP_OUTPUT) { + fprintf(stderr, "==== %s output (combined, pre det+min)\n", __func__); + fsm_print(stderr, res, &dump_nfa_opt, NULL, FSM_PRINT_DOT); + fsm_endid_dump(stderr, res); + fsm_eager_output_dump(stderr, res); + } + f_free(alloc, ainfos); return res; diff --git a/tests/eager_output/eager_output_unanchored_end_plus.c b/tests/eager_output/eager_output_unanchored_end_plus.c new file mode 100644 index 000000000..81354ffcb --- /dev/null +++ b/tests/eager_output/eager_output_unanchored_end_plus.c @@ -0,0 +1,19 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "abcx+", + "z", + }, + .inputs = { + { .input = "abc", .expect_fail = true }, + { .input = "abcx", .expected_ids = { 1 } }, + { .input = "abcxx", .expected_ids = { 1 } }, + { .input = "z", .expected_ids = { 2 } }, + }, + }; + + return run_test(&test); +} diff --git a/tests/eager_output/eager_output_unanchored_end_plus_min.c b/tests/eager_output/eager_output_unanchored_end_plus_min.c new file mode 100644 index 000000000..3d270514e --- /dev/null +++ b/tests/eager_output/eager_output_unanchored_end_plus_min.c @@ -0,0 +1,18 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "abcx+", + }, + .inputs = { + { .input = "abc", .expect_fail = true }, + { .input = "abcx", .expected_ids = { 1 } }, + { .input = "abcxx", .expected_ids = { 1 } }, 
+ { .input = "z", .expect_fail = true }, + }, + }; + + return run_test(&test); +} diff --git a/tests/eager_output/eager_output_unanchored_end_plus_min2.c b/tests/eager_output/eager_output_unanchored_end_plus_min2.c new file mode 100644 index 000000000..20d735fcc --- /dev/null +++ b/tests/eager_output/eager_output_unanchored_end_plus_min2.c @@ -0,0 +1,19 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "z", + }, + .inputs = { + { .input = "abc", .expect_fail = true }, + { .input = "z", .expected_ids = { 1 } }, + { .input = "Xz", .expected_ids = { 1 } }, + { .input = "XzX", .expected_ids = { 1 } }, + { .input = "zX", .expected_ids = { 1 } }, + }, + }; + + return run_test(&test); +} diff --git a/tests/eager_output/eager_output_unanchored_end_plus_multichar.c b/tests/eager_output/eager_output_unanchored_end_plus_multichar.c new file mode 100644 index 000000000..40d3e5d24 --- /dev/null +++ b/tests/eager_output/eager_output_unanchored_end_plus_multichar.c @@ -0,0 +1,19 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "abc(xy)+", + "z", + }, + .inputs = { + { .input = "abc", .expect_fail = true }, + { .input = "abcxy", .expected_ids = { 1 } }, + { .input = "abcxyxy", .expected_ids = { 1 } }, + { .input = "z", .expected_ids = { 2 } }, + }, + }; + + return run_test(&test); +} From 7b8b16948241a40253f191339848995f30443b9c Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 12 Feb 2025 12:12:15 -0500 Subject: [PATCH 41/80] Interface change: Add 'const'. 
--- src/libfsm/closure.c | 4 ++-- src/libfsm/internal.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libfsm/closure.c b/src/libfsm/closure.c index 3993afcda..165fa4964 100644 --- a/src/libfsm/closure.c +++ b/src/libfsm/closure.c @@ -128,7 +128,7 @@ epsilon_closure_single(const struct fsm *fsm, struct state_set **closures, fsm_s } struct state_set ** -epsilon_closure(struct fsm *fsm) +epsilon_closure(const struct fsm *fsm) { struct state_set **closures; fsm_state_t s; @@ -190,7 +190,7 @@ epsilon_closure(struct fsm *fsm) } void -closure_free(struct fsm *fsm, struct state_set **closures, size_t n) +closure_free(const struct fsm *fsm, struct state_set **closures, size_t n) { fsm_state_t s; diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index 46997c82a..094723fdb 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -94,10 +94,10 @@ state_hasnondeterminism(const struct fsm *fsm, fsm_state_t state, struct bm *bm) * for states, with wrapper to populate malloced array of user-facing structs. */ struct state_set ** -epsilon_closure(struct fsm *fsm); +epsilon_closure(const struct fsm *fsm); void -closure_free(struct fsm *fsm, struct state_set **closures, size_t n); +closure_free(const struct fsm *fsm, struct state_set **closures, size_t n); /* * Internal free function that invokes free(3) by default, or a user-provided From 6b07bd2dfd943bda45d6d8e5641f18b6118f1ed1 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 12 Feb 2025 14:56:49 -0500 Subject: [PATCH 42/80] union: Fix trivial memory leak. 
--- src/libfsm/union.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 9acdddf01..7ef7bb278 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -770,6 +770,7 @@ free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) { state_set_free(ainfo->anchored_ends); state_set_free(ainfo->eager_matches); + state_set_free(ainfo->needs_indirect_epsilon_edge_to_eager_match_state); edge_set_free(alloc, ainfo->anchored_firsts); edge_set_free(alloc, ainfo->repeatable_firsts); } From 7546f81e4b461f9b42ec70ec748eb42daae0a62b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 12 Feb 2025 14:57:01 -0500 Subject: [PATCH 43/80] union.c: Add comments for assertions. Fuzzing has produced inputs that cause this to fail, but they all depend on embedded '\0' characters. I wasn't able to reproduce the failure without those present, but I will investigate further later. For now, adding a TODO. --- src/libfsm/union.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 7ef7bb278..a0790c3f9 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -435,7 +435,17 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) if (LOG_ANALYZE_GROUP_NFA) { fprintf(stderr, "%s: unanchored_start_loop found on state %d\n", __func__, ns_i); } - /* there can be only one */ + + /* TODO: There is only one unanchored start loop, but in obscure cases it may + * be difficult to distinguish between the USL and the unanchored end loop or + * other intermediate .* loops. The real USL will strictly appear before any + * other such loops in the graph. + * + * For now, assert that there is only one, because it's safer to have this + * loudly fail at compile time than produce an incorrect graph. 
Fuzzing has + * produced some inputs that make this fail, but currently they seem to + * depend on having a '\0' character embedded in the middle, which would + * normally be rejected by this point. */ assert(ainfo->unanchored_start_loop == NO_STATE || ainfo->unanchored_start_loop == ns_i); ainfo->unanchored_start_loop = ns_i; @@ -445,7 +455,10 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) if (LOG_ANALYZE_GROUP_NFA) { fprintf(stderr, "%s: anchored_start found on state %d\n", __func__, ns_i); } + + /* TODO: This, too, can fail in obscure cases and needs further investigation. */ assert(ainfo->anchored_start == NO_STATE || ainfo->anchored_start == ns_i); + ainfo->anchored_start = ns_i; continue; } From bb9f6203274d6ffbffa94ab52905a8b978deae6a Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 14 Feb 2025 13:41:24 -0500 Subject: [PATCH 44/80] Switch to collecting an anchored_start state set, not just one state. `(^|wax-)((?:banana|^apple))` is an example of a regex that needs multiple anchored_start states linked in order to combine correctly. --- src/libfsm/union.c | 22 ++++++++++--------- ...ger_output_mixed_start_anchor_regression.c | 20 +++++++++++++++++ 2 files changed, 32 insertions(+), 10 deletions(-) create mode 100644 tests/eager_output/eager_output_mixed_start_anchor_regression.c diff --git a/src/libfsm/union.c b/src/libfsm/union.c index a0790c3f9..9062e1dbd 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -48,8 +48,8 @@ struct analysis_info { /* The end state following the unanchored end loop. */ fsm_state_t unanchored_end_loop_end; - /* State that links to paths only reachable from the beginning of input. */ - fsm_state_t anchored_start; + /* States that link to paths only reachable from the beginning of input. */ + struct state_set *anchored_starts; /* States leading to an anchored end. 
*/ struct state_set *anchored_ends; @@ -396,7 +396,6 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) memset(ainfo, 0x00, sizeof(*ainfo)); ainfo->start = NO_STATE; ainfo->unanchored_start_loop = NO_STATE; - ainfo->anchored_start = NO_STATE; ainfo->unanchored_end_loop = NO_STATE; ainfo->unanchored_end_loop_end = NO_STATE; ainfo->eager_match_state = NO_STATE; @@ -456,10 +455,9 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) fprintf(stderr, "%s: anchored_start found on state %d\n", __func__, ns_i); } - /* TODO: This, too, can fail in obscure cases and needs further investigation. */ - assert(ainfo->anchored_start == NO_STATE || ainfo->anchored_start == ns_i); - - ainfo->anchored_start = ns_i; + if (!state_set_add(&ainfo->anchored_starts, nfa->alloc, ns_i)) { + goto alloc_fail; + } continue; } } @@ -492,9 +490,12 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) /* Copy labeled edges from the anchored start and its epsilon * closure to ainfo->anchored_firsts. 
*/ - if (ainfo->anchored_start != NO_STATE) { + struct state_iter si_anchored_start; + state_set_reset(ainfo->anchored_starts, &si_anchored_start); + fsm_state_t anchored_start; + while (state_set_next(&si_anchored_start, &anchored_start)) { struct state_iter si; - state_set_reset(eclosures[ainfo->anchored_start], &si); + state_set_reset(eclosures[anchored_start], &si); fsm_state_t cs_i; while (state_set_next(&si, &cs_i)) { assert(cs_i < nfa->statecount); @@ -766,11 +767,11 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base) SHIFT(unanchored_start_loop); SHIFT(unanchored_end_loop); SHIFT(unanchored_end_loop_end); - SHIFT(anchored_start); SHIFT(eager_match_state); #undef SHIFT state_set_rebase(&ainfo->anchored_ends, base); + state_set_rebase(&ainfo->anchored_starts, base); state_set_rebase(&ainfo->eager_matches, base); edge_set_rebase(&ainfo->anchored_firsts, base); @@ -782,6 +783,7 @@ static void free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) { state_set_free(ainfo->anchored_ends); + state_set_free(ainfo->anchored_starts); state_set_free(ainfo->eager_matches); state_set_free(ainfo->needs_indirect_epsilon_edge_to_eager_match_state); edge_set_free(alloc, ainfo->anchored_firsts); diff --git a/tests/eager_output/eager_output_mixed_start_anchor_regression.c b/tests/eager_output/eager_output_mixed_start_anchor_regression.c new file mode 100644 index 000000000..2965dc7aa --- /dev/null +++ b/tests/eager_output/eager_output_mixed_start_anchor_regression.c @@ -0,0 +1,20 @@ +#include "utils.h" + +/* Regression: This is a case that requires an anchored_start state set + * rather than a single optional anchored_start state ID to link + * correctly. 
*/ + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "(^|wax-)((?:banana|^apple))", + "(^|wax-)(orange)", + }, + .inputs = { + { .input = "banana", .expected_ids = { 1 } }, + }, + }; + + return run_test(&test); +} From a54095a4813ee6cfc7d0d3dfe8f3a7abf3094e51 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 19 Sep 2025 12:13:03 -0400 Subject: [PATCH 45/80] Rename test file -- make tests looks for build files matching "*res*". `build/tests/eager_output/run_mixed_start_anchor_regression` matches "*res*" because of "regression", so `make test` incorrectly treats it like any of the other output test result files. Rename it, to remove "regression". This is an unexpected consequence of cb42d58f. --- ...tart_anchor_regression.c => eager_output_mixed_start_anchor.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/eager_output/{eager_output_mixed_start_anchor_regression.c => eager_output_mixed_start_anchor.c} (100%) diff --git a/tests/eager_output/eager_output_mixed_start_anchor_regression.c b/tests/eager_output/eager_output_mixed_start_anchor.c similarity index 100% rename from tests/eager_output/eager_output_mixed_start_anchor_regression.c rename to tests/eager_output/eager_output_mixed_start_anchor.c From eca233da96c54edc6f8b148a1c244763f8a19feb Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Tue, 9 Sep 2025 17:18:57 -0400 Subject: [PATCH 46/80] union.c: Updates to fsm_union_repeated_pattern_group and its internals. - Remove some dead code. - Update several comments. - Rename a couple functions to clarify their use. - Refine detection of the unanchored start loop. 
--- src/libfsm/union.c | 224 +++++++++++++++++++++++++++------------------ 1 file changed, 134 insertions(+), 90 deletions(-) diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 9062e1dbd..7e011af8e 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -74,6 +74,11 @@ struct analysis_info { /* These states need an epsilon edge added to the eager_matched_state. */ struct state_set *needs_indirect_epsilon_edge_to_eager_match_state; + + /* States which are reachable from any state besides the start + * state. This can be necessary to correctly identify the + * unanchored start loop. */ + struct state_set *reachable_from_nonstart_state; }; struct fsm * @@ -209,7 +214,7 @@ fsm_union_array(size_t fsm_count, } static bool -has_dot_self_edge(const struct fsm *nfa, fsm_state_t s_i) +state_has_dot_self_edge(const struct fsm *nfa, fsm_state_t s_i) { const struct fsm_state *s = &nfa->states[s_i]; @@ -260,15 +265,15 @@ dump_edge_set(FILE *f, const char *name, fsm_state_t from, const struct edge_set #endif /* For each state in the epsilon closure, if there's a labeled edge - * to an end state, check if the label set is only [\n] and there's - * also an epsilon edge to the same end state. + * to an end state with no outgoing edges, check if the label set is + * only [\n] and there's also an epsilon edge to the same end state. * If so, this represents an anchored end in the NFA. 
*/ static bool -has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa, struct state_set *eclosure, - fsm_state_t s_i, fsm_state_t *dst_end) +state_has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa, + fsm_state_t s_id, struct state_set *s_eclosure, fsm_state_t *dst_end) { struct state_iter si; - state_set_reset(eclosure, &si); + state_set_reset(s_eclosure, &si); fsm_state_t ns_i; while (state_set_next(&si, &ns_i)) { assert(ns_i < nfa->statecount); @@ -289,11 +294,18 @@ has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa, struct state_se /* If it's an end, look for an epsilon leeding to the same destination */ if (fsm_isend(nfa, info.to)) { + assert(info.to < nfa->statecount); + const struct fsm_state *end_candidate = &nfa->states[info.to]; + if (!state_set_empty(end_candidate->epsilons) || + !edge_set_empty(end_candidate->edges)) { + continue; /* not an anchored end */ + } + struct state_iter inner_si; fsm_state_t os_i; - assert(s_i < nfa->statecount); - const struct fsm_state *s = &nfa->states[s_i]; + assert(s_id < nfa->statecount); + const struct fsm_state *s = &nfa->states[s_id]; state_set_reset(s->epsilons, &inner_si); while (state_set_next(&inner_si, &os_i)) { @@ -310,50 +322,39 @@ has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa, struct state_se } static bool -has_labeled_edge_to_eclosure_with_unanchored_end_loop(const struct fsm *nfa, - struct state_set **eclosures, - fsm_state_t s_i, fsm_state_t unanchored_end_loop, +state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(const struct fsm *nfa, + fsm_state_t s_i, struct state_set **eclosures, + fsm_state_t unanchored_end_loop, fsm_state_t *indirect_dst) { if (unanchored_end_loop == NO_STATE) { return false; } assert(unanchored_end_loop < nfa->statecount); assert(s_i < nfa->statecount); - const struct state_set *s_eclosure = eclosures[s_i]; - - /* For every state in s_i's epsilon closure, check if it has - * a labeled edge to a state with the 
unanchored_end_loop - * in its epsilon closure. */ - struct state_iter si; - state_set_reset(s_eclosure, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - /* The unanchored_end_loop's self-edge doesn't count here. */ - if (ns_i == unanchored_end_loop) { continue; } - /* FIXME: this should only apply to the original state, not its epsilon closure...right? */ - if (ns_i != s_i) { continue; } + /* The unanchored_end_loop doesn't count, here. */ + if (s_i == unanchored_end_loop) { return false; } - assert(ns_i < nfa->statecount); - const struct fsm_state *ns = &nfa->states[ns_i]; - struct edge_group_iter iter; - struct edge_group_iter_info info; - edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &iter); - while (edge_set_group_iter_next(&iter, &info)) { - assert(info.to < nfa->statecount); - const struct state_set *to_eclosure = eclosures[info.to]; - - struct state_iter dst_si; - state_set_reset(to_eclosure, &dst_si); - fsm_state_t dst_s_i; - while (state_set_next(&dst_si, &dst_s_i)) { - if (dst_s_i == unanchored_end_loop) { - if (info.to != unanchored_end_loop) { - *indirect_dst = info.to; - } - - return true; + /* Check whether the state has a labeled edge to a state with the + * unanchored_end_loop in its epsilon closure. 
*/ + const struct fsm_state *s = &nfa->states[s_i]; + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + assert(info.to < nfa->statecount); + const struct state_set *to_eclosure = eclosures[info.to]; + + struct state_iter dst_si; + state_set_reset(to_eclosure, &dst_si); + fsm_state_t dst_s_i; + while (state_set_next(&dst_si, &dst_s_i)) { + if (dst_s_i == unanchored_end_loop) { + if (info.to != unanchored_end_loop) { + *indirect_dst = info.to; } + + return true; } } } @@ -412,9 +413,41 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) return false; } - /* First pass: Iterate over the start state's epsilon edges, - * attempting to identify the unanchored start loop and anchored - * start states (if present). + /* Mark any states that are reachable from any state besides the + * start state -- this means they cannot be the unanchored start + * loop, in cases where the pass below would otherwise detect + * more than one. 
*/ + for (fsm_state_t s_i = 0; s_i < state_count; s_i++) { + if (s_i == ainfo->start) { continue; } + + struct state_iter si; + state_set_reset(nfa->states[s_i].epsilons, &si); + fsm_state_t eps_i; + while (state_set_next(&si, &eps_i)) { + /* Ignore self edges */ + if (eps_i == s_i) { continue; } + + if (!state_set_add(&ainfo->reachable_from_nonstart_state, nfa->alloc, eps_i)) { + return false; + } + } + + struct edge_group_iter egi; + struct edge_group_iter_info info; + edge_set_group_iter_reset(nfa->states[s_i].edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + /* Ignore self edges */ + if (info.to == s_i) { continue; } + + if (!state_set_add(&ainfo->reachable_from_nonstart_state, nfa->alloc, info.to)) { + return false; + } + } + } + + /* Iterate over the start state's epsilon edges, attempting to + * identify the unanchored start loop and anchored start states + * (if present). * * Note: This uses the start state's epsilon set rather than its * epsilon closure because (by construction) the unanchored @@ -426,31 +459,55 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) state_set_reset(nfa->states[ainfo->start].epsilons, &si); fsm_state_t ns_i; while (state_set_next(&si, &ns_i)) { + /* Ignore self edges. */ if (ns_i == ainfo->start) { continue; } /* If there's a state in the start state's epsilon set that * has a dot self-edge, it's the unanchored start loop. */ - if (has_dot_self_edge(nfa, ns_i)) { + if (state_has_dot_self_edge(nfa, ns_i)) { if (LOG_ANALYZE_GROUP_NFA) { fprintf(stderr, "%s: unanchored_start_loop found on state %d\n", __func__, ns_i); } - /* TODO: There is only one unanchored start loop, but in obscure cases it may - * be difficult to distinguish between the USL and the unanchored end loop or - * other intermediate .* loops. The real USL will strictly appear before any - * other such loops in the graph. 
+ /* By construction, the true unanchored start loop is only reachable + * via an epsilon edge from the start state, so if any other state + * has an epsilon or labeled edge to this one, it cannot be the + * unanchored start loop. + * + * This is necessary for cases like '^|x', which produces: + * + * 0 -> 2; + * 0 -> 3; + * 2 -> 2 "\x00" .. "\xff"; + * 2 -> 3 "x"; + * 3 -> 1; + * 3 -> 3 ?; * - * For now, assert that there is only one, because it's safer to have this - * loudly fail at compile time than produce an incorrect graph. Fuzzing has - * produced some inputs that make this fail, but currently they seem to - * depend on having a '\0' character embedded in the middle, which would - * normally be rejected by this point. */ - assert(ainfo->unanchored_start_loop == NO_STATE - || ainfo->unanchored_start_loop == ns_i); + * start: 0; + * end: 1 = [0]; + * + * where this analysis would otherwise identify both 2 (correct) + * and 3 (incorrect) as the unanchored start loop. Both are reachable + * from the start state via an epsilon edge, but the labeled edge + * 2->3 'x' rules 3 out. + * */ + if (state_set_contains(ainfo->reachable_from_nonstart_state, ns_i)) { + continue; + } + + /* The reachable_from_nonstart_state check handles the other cases, + * but for `$|^` other attempts to distinguish them will fail, + * but by construction the USL will have the earlier state ID. */ + if (ainfo->unanchored_start_loop != NO_STATE && + ainfo->unanchored_start_loop < ns_i) { + continue; + } + ainfo->unanchored_start_loop = ns_i; continue; } else { - /* Otherwise, a state without a dot self-edge is the anchored start. */ + /* Otherwise, a state without a dot self-edge is an anchored start. + * There may be more than one. */ if (LOG_ANALYZE_GROUP_NFA) { fprintf(stderr, "%s: anchored_start found on state %d\n", __func__, ns_i); } @@ -523,7 +580,7 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) * the unanchored end loop. There should only be one. 
*/ for (size_t s_i = 0; s_i < state_count; s_i++) { const struct fsm_state *s = &nfa->states[s_i]; - if (has_dot_self_edge(nfa, s_i)) { + if (state_has_dot_self_edge(nfa, s_i)) { struct state_iter si; state_set_reset(s->epsilons, &si); fsm_state_t ns_i; @@ -542,14 +599,14 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) /* Collect states that lead to an anchored end or eager match. */ for (size_t s_i = 0; s_i < state_count; s_i++) { fsm_state_t dst_end = NO_STATE; - if (has_epsilon_and_newline_edges_to_same_end(nfa, eclosures[s_i], s_i, &dst_end)) { + if (state_has_epsilon_and_newline_edges_to_same_end(nfa, s_i, eclosures[s_i], &dst_end)) { if (!state_set_add(&ainfo->anchored_ends, nfa->alloc, dst_end)) { goto alloc_fail; } } fsm_state_t indirect_dst = NO_STATE; - if (has_labeled_edge_to_eclosure_with_unanchored_end_loop(nfa, eclosures, s_i, ainfo->unanchored_end_loop, &indirect_dst)) { + if (state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(nfa, s_i, eclosures, ainfo->unanchored_end_loop, &indirect_dst)) { if (!state_set_add(&ainfo->eager_matches, nfa->alloc, s_i)) { goto alloc_fail; } @@ -773,6 +830,7 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base) state_set_rebase(&ainfo->anchored_ends, base); state_set_rebase(&ainfo->anchored_starts, base); state_set_rebase(&ainfo->eager_matches, base); + state_set_rebase(&ainfo->reachable_from_nonstart_state, base); edge_set_rebase(&ainfo->anchored_firsts, base); edge_set_rebase(&ainfo->repeatable_firsts, base); @@ -786,6 +844,7 @@ free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) state_set_free(ainfo->anchored_starts); state_set_free(ainfo->eager_matches); state_set_free(ainfo->needs_indirect_epsilon_edge_to_eager_match_state); + state_set_free(ainfo->reachable_from_nonstart_state); edge_set_free(alloc, ainfo->anchored_firsts); edge_set_free(alloc, ainfo->repeatable_firsts); } @@ -838,9 +897,11 @@ fsm_union_repeated_pattern_group(size_t 
nfa_count, fsm_state_t global_start; if (!fsm_addstate(res, &global_start)) { goto fail; } - /* States linking to the starts of unanchored and anchored subgraphs, respectively. - * Matching other group NFAs loops back to the global_unanchored_start_loop, but - * patterns anchored at the ^start are only reachable via global_anchored_start. */ + /* States linking to the starts of unanchored and anchored + * subgraphs, respectively. Matching group NFAs with unanchored + * ends will loop back to the global_unanchored_start_loop, but + * patterns anchored at the start are only reachable via + * global_anchored_start. */ fsm_state_t global_unanchored_start_loop, global_anchored_start; if (!fsm_addstate(res, &global_unanchored_start_loop)) { goto fail; } if (!fsm_addstate(res, &global_anchored_start)) { goto fail; } @@ -850,27 +911,6 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addstate(res, &global_end)) { goto fail; } if (!fsm_addstate(res, &global_unanchored_end_loop)) { goto fail; } - /* do this later, combining NFAs may rebase the state IDs */ -#if 0 - /* link the start to the global unanchored start loop and anchored start. */ - if (log) { - fprintf(stderr, "link_before: global_start %d -> global_unanchored_start_loop %d and global_anchored_start %d\n", - global_start, global_unanchored_start_loop, global_anchored_start); - } - if (!fsm_addedge_epsilon(res, global_start, global_unanchored_start_loop)) { goto fail; } - if (!fsm_addedge_epsilon(res, global_start, global_anchored_start)) { goto fail; } - - /* Link the global unanchored start loop to itself. */ - if (!fsm_addedge_any(res, global_unanchored_start_loop, global_unanchored_start_loop)) { goto fail; } - - /* Link the global unanchored end loop and global end. 
*/ - if (log) { - fprintf(stderr, "link_before: global_unanchored_end_loop %d -> global_end %d (and -> self)\n", global_unanchored_end_loop, global_end); - } - if (!fsm_addedge_any(res, global_unanchored_end_loop, global_unanchored_end_loop)) { goto fail; } - if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } -#endif - if (bases != NULL) { memset(bases, 0x00, nfa_count * sizeof(bases[0])); } @@ -889,7 +929,7 @@ fsm_union_repeated_pattern_group(size_t nfa_count, } assert(ainfo->start < state_count); - /* Call fsm_merge; we really don't care which is which. */ + /* Call fsm_merge; the argument order shouldn't matter. */ struct fsm_combine_info combine_info; struct fsm *merged = fsm_merge(res, fsm, &combine_info); if (merged == NULL) { goto fail; } @@ -1021,7 +1061,7 @@ fsm_union_repeated_pattern_group(size_t nfa_count, res = merged; } - /* link the start to the global unanchored start loop and anchored start. */ + /* Link the global start to the global unanchored start loop and anchored start states. */ if (log) { fprintf(stderr, "linking: global_start %d -> global_unanchored_start_loop %d and global_anchored_start %d\n", global_start, global_unanchored_start_loop, global_anchored_start); @@ -1029,7 +1069,8 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addedge_epsilon(res, global_start, global_unanchored_start_loop)) { goto fail; } if (!fsm_addedge_epsilon(res, global_start, global_anchored_start)) { goto fail; } - /* Link the global unanchored start loop to itself. */ + /* Link the global unanchored start loop to itself, so it can + * consume and ignore input preceding each matching group NFA. */ if (!fsm_addedge_any(res, global_unanchored_start_loop, global_unanchored_start_loop)) { goto fail; } /* Link the global unanchored end loop and global end. 
*/ @@ -1050,9 +1091,12 @@ fsm_union_repeated_pattern_group(size_t nfa_count, fprintf(stderr, "%s: setting global_start %d and end %d\n", __func__, global_start, global_end); } if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_unanchored_start_loop)) { goto fail; } + + /* Link the global unanchored end loop to the global end, so + * reaching the end of input there is considered a match. */ if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } - /* This needs to be set after merging, because that clears the start state. */ + /* These need to be set after merging, because that clears the start state. */ fsm_setstart(res, global_start); fsm_setend(res, global_end, 1); From e7125757382b1a13b625968d67a6392c98ed5715 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 15 Sep 2025 16:44:11 -0400 Subject: [PATCH 47/80] Eager outputs: Comment on use, rename some functions, clean up. --- include/fsm/fsm.h | 97 ++++++++++++++++++++----------- src/libfsm/clone.c | 2 +- src/libfsm/consolidate.c | 2 +- src/libfsm/determinise.c | 2 +- src/libfsm/eager_output.c | 26 ++++++--- src/libfsm/epsilons.c | 2 +- src/libfsm/libfsm.syms | 6 +- src/libfsm/merge.c | 2 +- src/libfsm/minimise_test_oracle.c | 4 +- src/libfsm/print/ir.c | 2 +- src/libfsm/union.c | 2 +- 11 files changed, 95 insertions(+), 52 deletions(-) diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index f78d91d71..d57d43b1e 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -270,36 +270,48 @@ fsm_mapendids(struct fsm * fsm, fsm_endid_remap_fun remap, void *opaque); void fsm_increndids(struct fsm * fsm, int delta); -/* Associate an eagerly matched numeric ID with the end states in an fsm. - * - * This is similar to fsm_setendid, but has different performance - * trade-offs. 
In particular, it can become extremely expensive to - * combine multiple DFAs with endids on their end states when they - * representing regexes with unanchored ends, because the FSM has to - * explicitly represent all the possible combinations of matches by - * copying the entire path to every reachable end state. Eager endids - * are associated with the edge leaving the main pattern match. - * - * Returns 1 on success, 0 on error. - * */ -int -fsm_seteagerendid(struct fsm *fsm, fsm_end_id_t id); - /* Set an eager output ID to emit every time the state is entered. - * This turns the automata into a Moore machine. */ + * This is similar to fsm_setendid, but has different performance + * trade-offs for determinisation, and can be applied to + * non-end states. + * + * During DFA execution, states with eager outputs will output their + * ID when output reaches them. With fsm_exec, this happens via a + * callback (see fsm_eager_output_set_cb). Some print languages + * will eventually eager outputs. + * + * One use case for eager outputs is combining multiple unanchored + * regexes into a single DFA and detecting when input matches more than + * one of them. With endids, determinisation has to represent every + * possible reachable combination of endids as a distinct copy of the + * DFA subgraph, leading to a combinatorial explosion that makes + * combining more than 8 or so regexes (even very simple ones) + * prohibitively expensive. With eager outputs, the graph no longer + * needs a separate subgraph copy for each combination of IDs, so it is + * possible to combine several dozen or even hundreds of FSMs into a + * single DFA. See fsm_union_repeated_pattern_group for more details. */ int -fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id); +fsm_eager_output_set(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id); /* Set an eager output ID on all current end states.
*/ int -fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id); +fsm_eager_output_set_on_ends(struct fsm *fsm, fsm_output_id_t id); -/* HACK */ +/* Callback for eager output processing. + * If set (using fsm_eager_output_set_cb), this may be called while fsm_exec runs. */ typedef void fsm_eager_output_cb(fsm_output_id_t id, void *opaque); + +/* Set a callback and opaque argument on an FSM for eager outputs encountered + * while fsm_exec is running. Rather than adding another pair of arguments to + * fsm_exec, this is called as a separate step -- most DFAs will not use eager + * outputs, or use them with code generation rather than fsm_exec. + * + * See fsm_eager_output_set for more details about eager output functionality. */ void fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque); +/* Get the eager output callback set on a FSM and its opaque pointer, if any. */ void fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque); @@ -307,11 +319,39 @@ fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void ** size_t fsm_eager_output_count(const struct fsm *fsm, fsm_state_t state); -/* Get eager output associated with a state. It's expected that buf[] has - * sufficient space -- call fsm_eager_output_count first to get the count. - * The contents of buf will be sorted and unique. */ -void -fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, fsm_output_id_t *buf); +/* Get eager output IDs associated with a state, if any. + * id_buf is expected to have enough cells (according to buf_count) + * to store all the IDs. You can find this with fsm_eager_output_count(). + * + * The IDs in the buffer are sorted and do not have duplicates. + * + * Unlike end IDs, eager outputs can appear on states that are + * not marked as end states. + * + * Returns 0 if there is not enough space in id_buf for the + * eager output IDs, or 1 if zero or more IDs were returned.
*/ +int +fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, + size_t buf_count, fsm_output_id_t *id_buf); + +/* Get the end IDs associated with an end state, if any. + * id_buf is expected to have enough cells (according to id_buf_count) + * to store all the end IDs. You can find this with fsm_endid_count(). + * + * The end IDs in the buffer are sorted and do not have duplicates. + * + * A state with no end IDs set is considered equivalent to a state + * that has the empty set, this API does not distinguish these cases. + * This is not an error. + * + * It is an error to attempt to get end IDs associated with a state + * that is not marked as an end state. + * + * Returns 0 if there is not enough space in id_buf for the + * end IDs, or 1 if zero or more end IDs were returned. */ +int +fsm_endid_get(const struct fsm *fsm, fsm_state_t end_state, + size_t id_buf_count, fsm_end_id_t *id_buf); /* * Find the state (if there is just one), or add epsilon edges from all states, @@ -498,15 +538,6 @@ fsm_shortest(const struct fsm *fsm, fsm_state_t start, fsm_state_t goal, unsigned (*cost)(fsm_state_t from, fsm_state_t to, char c)); -/* HACK */ -typedef void -fsm_eager_endid_cb(fsm_end_id_t id, void *opaque); -void -fsm_eager_endid_set_cb(struct fsm *fsm, fsm_eager_endid_cb *cb, void *opaque); - -void -fsm_eager_endid_get_cb(const struct fsm *fsm, fsm_eager_endid_cb **cb, void **opaque); - /* * Execute an FSM reading input from the user-specified callback fsm_getc(). 
* fsm_getc() is passed the opaque pointer given, and is expected to return diff --git a/src/libfsm/clone.c b/src/libfsm/clone.c index 2161599ae..068aca1c2 100644 --- a/src/libfsm/clone.c +++ b/src/libfsm/clone.c @@ -179,7 +179,7 @@ static int copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { struct copy_eager_output_ids_env *env = opaque; - if (!fsm_seteageroutput(env->dst, state, id)) { + if (!fsm_eager_output_set(env->dst, state, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/consolidate.c b/src/libfsm/consolidate.c index b7a8905b2..55c3bfd64 100644 --- a/src/libfsm/consolidate.c +++ b/src/libfsm/consolidate.c @@ -294,7 +294,7 @@ consolidate_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opa assert(state < env->mapping_count); const fsm_state_t dst_state = env->mapping[state]; - if (!fsm_seteageroutput(env->dst, dst_state, id)) { + if (!fsm_eager_output_set(env->dst, dst_state, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 9833fd878..29a32d4ea 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -2599,7 +2599,7 @@ remap_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { (void)state; struct remap_eager_output_env *env = opaque; - if (!fsm_seteageroutput(env->dst, env->dst_state, id)) { + if (!fsm_eager_output_set(env->dst, env->dst_state, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c index 00fa1b5f0..04e067178 100644 --- a/src/libfsm/eager_output.c +++ b/src/libfsm/eager_output.c @@ -107,13 +107,13 @@ fsm_eager_output_free(struct fsm *fsm) } int -fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id) +fsm_eager_output_set_on_ends(struct fsm *fsm, fsm_output_id_t id) { assert(fsm != NULL); const size_t count = fsm_countstates(fsm); for (size_t i = 0; i < count; i++) { if (fsm_isend(fsm, i)) { - if (!fsm_seteageroutput(fsm, i, id)) { return 0; } 
+ if (!fsm_eager_output_set(fsm, i, id)) { return 0; } } } return 1; @@ -157,7 +157,7 @@ grow_htab(const struct fsm_alloc *alloc, struct eager_output_htab *htab) } int -fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) +fsm_eager_output_set(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) { assert(fsm != NULL); @@ -296,7 +296,9 @@ fsm_eager_output_count(const struct fsm *fsm, fsm_state_t state) } struct get_env { + bool ok; size_t count; + size_t ceil; fsm_output_id_t *buf; }; @@ -305,6 +307,10 @@ append_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { struct get_env *env = opaque; (void)state; + if (env->count == env->ceil) { + env->ok = false; + return 0; + } env->buf[env->count++] = id; return 1; } @@ -317,12 +323,18 @@ cmp_fsm_output_id_t(const void *pa, const void *pb) return a < b ? -1 : a > b ? 1 : 0; } -void -fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, fsm_output_id_t *buf) +int +fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, + size_t buf_count, fsm_output_id_t *id_buf) { - struct get_env env = { .buf = buf }; + struct get_env env = { + .ok = true, + .buf = id_buf, + .ceil = buf_count, + }; fsm_eager_output_iter_state(fsm, state, append_cb, &env); - qsort(buf, env.count, sizeof(buf[0]), cmp_fsm_output_id_t); + qsort(id_buf, env.count, sizeof(id_buf[0]), cmp_fsm_output_id_t); + return env.ok ? 
1 : 0; } void diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index 8041c29d3..834d88cbf 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -246,7 +246,7 @@ fsm_remove_epsilons(struct fsm *nfa) } for (size_t i = 0; i < eager_output_buf.used; i++) { - if (!fsm_seteageroutput(nfa, s, eager_output_buf.ids[i])) { + if (!fsm_eager_output_set(nfa, s, eager_output_buf.ids[i])) { goto cleanup; } } diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index ab28b0a21..a498ced4d 100644 --- a/src/libfsm/libfsm.syms +++ b/src/libfsm/libfsm.syms @@ -100,10 +100,10 @@ fsm_increndids fsm_endid_dump -fsm_seteageroutput -fsm_seteageroutputonends +fsm_eager_output_set +fsm_eager_output_set_on_ends fsm_eager_output_count -# short term hack +fsm_eager_output_get_cb fsm_eager_output_set_cb fsm_eager_output_dump fsm_eager_output_get diff --git a/src/libfsm/merge.c b/src/libfsm/merge.c index ccc1568ff..267b5b1df 100644 --- a/src/libfsm/merge.c +++ b/src/libfsm/merge.c @@ -214,7 +214,7 @@ static int copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { struct copy_eager_output_ids_env *env = opaque; - if (!fsm_seteageroutput(env->dst, state + env->base_src, id)) { + if (!fsm_eager_output_set(env->dst, state + env->base_src, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/minimise_test_oracle.c b/src/libfsm/minimise_test_oracle.c index 20d4633a1..65bd3b43d 100644 --- a/src/libfsm/minimise_test_oracle.c +++ b/src/libfsm/minimise_test_oracle.c @@ -212,7 +212,7 @@ fsm_minimise_test_oracle(const struct fsm *fsm) int eres = fsm_endid_get(fsm, i, endid_count_a, ids_a); assert(eres == 1); - fsm_eager_output_get(fsm, i, eo_ids_a); + fsm_eager_output_get(fsm, i, max_eager_output_count, eo_ids_a); bool found = false; /* note: skipping eg 0 here since that's the empty set */ @@ -234,7 +234,7 @@ fsm_minimise_test_oracle(const struct fsm *fsm) endid_count_b, ids_b); assert(eres == 1); - fsm_eager_output_get(fsm, 
end_md_group_leaders[eg_i], eo_ids_b); + fsm_eager_output_get(fsm, end_md_group_leaders[eg_i], max_eager_output_count, eo_ids_b); if ((0 == memcmp(ids_a, ids_b, endid_count_a * sizeof(ids_a[0]))) && (0 == memcmp(eo_ids_a, eo_ids_b, eager_output_count_a * sizeof(eo_ids_a[0])))) { diff --git a/src/libfsm/print/ir.c b/src/libfsm/print/ir.c index a18dadbbc..d8e29b2e7 100644 --- a/src/libfsm/print/ir.c +++ b/src/libfsm/print/ir.c @@ -577,7 +577,7 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) if (outputs == NULL) { goto error; } - fsm_eager_output_get(fsm, i, outputs->ids); + fsm_eager_output_get(fsm, i, eager_output_count, outputs->ids); outputs->count = eager_output_count; ir->states[i].eager_outputs = outputs; diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 7e011af8e..c3e8dad81 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -708,7 +708,7 @@ modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t /* Set eager match ID on new eager_match_state. */ const fsm_output_id_t oid = (fsm_output_id_t)(id + id_base); - if (!fsm_seteageroutput(nfa, ainfo->eager_match_state, oid)) { + if (!fsm_eager_output_set(nfa, ainfo->eager_match_state, oid)) { return false; } if (log) { From eeea923ff91f5f0f9a3d349763153c9096152afd Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 19 Sep 2025 12:35:19 -0400 Subject: [PATCH 48/80] fuzz/target.c: Updates for interface changes. 
--- fuzz/target.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fuzz/target.c b/fuzz/target.c index 4ff8b63bd..fd59aeea4 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -505,8 +505,6 @@ fuzz_eager_output(const uint8_t *data, size_t size) } } - enum re_is_anchored_res anchorage[MAX_PATTERNS] = {0}; - /* for each pattern, attempt to compile to a DFA */ for (size_t p_i = 0; p_i < env.pattern_count; p_i++) { const char *p = env.patterns[p_i]; @@ -526,7 +524,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) } const fsm_output_id_t endid = (fsm_output_id_t)p_i; - ret = fsm_seteageroutputonends(fsm, endid); + ret = fsm_eager_output_set_on_ends(fsm, endid); assert(ret == 1); if (verbose) { @@ -575,7 +573,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) /* copy and combine fsms into one DFA */ { size_t used = 0; - struct fsm_union_entry entries[MAX_PATTERNS] = {0}; + struct fsm *nfas[MAX_PATTERNS] = {0}; for (size_t i = 0; i < env.fsm_count; i++) { /* there can be gaps, fsms[] lines up with patterns[] */ @@ -601,9 +599,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) } } - entries[used].fsm = cp; - entries[used].anchored_start = anchorage[i] & RE_IS_ANCHORED_START; - entries[used].anchored_end = anchorage[i] & RE_IS_ANCHORED_END; + nfas[used] = cp; used++; } @@ -611,8 +607,8 @@ fuzz_eager_output(const uint8_t *data, size_t size) goto cleanup; /* nothing to do */ } - /* consumes entries[] */ - struct fsm *fsm = fsm_union_repeated_pattern_group(used, entries, NULL); + /* consumes nfas[] */ + struct fsm *fsm = fsm_union_repeated_pattern_group(used, nfas, NULL, 0); assert(fsm != NULL); if (verbose) { From d6db021c79214858c488fb4c96345af0b9a4a12a Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 19 Sep 2025 13:20:01 -0400 Subject: [PATCH 49/80] Misc. cleanup before integration. 
--- src/libfsm/eager_output.c | 14 +------------- src/libfsm/eager_output.h | 12 ++++++++++++ src/libfsm/epsilons.c | 4 +--- src/libfsm/exec.c | 2 -- src/libfsm/state.c | 1 - .../eager_output_mixed_anchored_unanchored.c | 3 --- 6 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c index 04e067178..a6dcdc89f 100644 --- a/src/libfsm/eager_output.c +++ b/src/libfsm/eager_output.c @@ -172,15 +172,11 @@ fsm_eager_output_set(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) const uint64_t mask = info->htab.bucket_count - 1; assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ - /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { const size_t b_i = (hash + probes) & mask; struct eager_output_bucket *b = &info->htab.buckets[b_i]; - /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ - /* __func__, state, b_i, b->state, (void *)b->entry); */ struct eager_output_entry *e = b->entry; - if (e == NULL) { /* empty */ - /* add */ + if (e == NULL) { /* empty, add */ const size_t alloc_sz = sizeof(*e) + DEF_ENTRY_CEIL * sizeof(e->ids[0]); e = f_calloc(fsm->alloc, 1, alloc_sz); @@ -191,8 +187,6 @@ fsm_eager_output_set(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) b->state = state; b->entry = e; info->htab.buckets_used++; - /* fprintf(stderr, "%s: buckets_used %zd\n", __func__, info->htab.buckets_used); */ - /* fprintf(stderr, "%s: saved new entry in bucket %zd\n", __func__, b_i); */ } else if (b->state != state) { /* collision */ continue; } @@ -214,7 +208,6 @@ fsm_eager_output_set(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) } e->ids[e->used++] = id; - /* fprintf(stderr, "%s: e->ids_used %u\n", __func__, e->used); */ fsm->states[state].has_eager_outputs = 1; return 1; } @@ -259,8 +252,6 @@ fsm_eager_output_iter_state(const struct fsm *fsm, for (size_t probes = 0; 
probes < info->htab.bucket_count; probes++) { const size_t b_i = (hash + probes) & mask; struct eager_output_bucket *b = &info->htab.buckets[b_i]; - /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ - /* __func__, state, b_i, b->state, (void *)b->entry); */ struct eager_output_entry *e = b->entry; if (e == NULL) { /* empty */ return; @@ -347,12 +338,9 @@ fsm_eager_output_iter_all(const struct fsm *fsm, struct eager_output_info *info = fsm->eager_output_info; - /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ for (size_t b_i = 0; b_i < info->htab.bucket_count; b_i++) { struct eager_output_bucket *b = &info->htab.buckets[b_i]; struct eager_output_entry *e = b->entry; - /* fprintf(stderr, "%s: b_i %zd, state %d, entry %p\n", */ - /* __func__, b_i, b->state, (void *)b->entry); */ if (e == NULL) { /* empty */ continue; } diff --git a/src/libfsm/eager_output.h b/src/libfsm/eager_output.h index 6093adc9e..b90da935c 100644 --- a/src/libfsm/eager_output.h +++ b/src/libfsm/eager_output.h @@ -1,3 +1,9 @@ +/* + * Copyright 2024 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + #ifndef EAGER_OUTPUT_H #define EAGER_OUTPUT_H @@ -13,12 +19,15 @@ fsm_eager_output_init(struct fsm *fsm); void fsm_eager_output_free(struct fsm *fsm); +/* Does an FSM have eager outputs? */ bool fsm_eager_output_has_eager_output(const struct fsm *fsm); +/* Does a particular state have eager outputs? */ bool fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state); +/* Dump eager outputs on an FSM. (For debugging.) */ void fsm_eager_output_dump(FILE *f, const struct fsm *fsm); @@ -28,14 +37,17 @@ fsm_eager_output_dump(FILE *f, const struct fsm *fsm); typedef int fsm_eager_output_iter_cb(fsm_state_t state, fsm_output_id_t id, void *opaque); +/* Iterate over eager outputs on a state. 
*/ void fsm_eager_output_iter_state(const struct fsm *fsm, fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque); +/* Iterate over all eager outputs on an FSM. */ void fsm_eager_output_iter_all(const struct fsm *fsm, fsm_eager_output_iter_cb *cb, void *opaque); +/* Compact eager output metadata. */ int fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count); diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index 834d88cbf..926e6d9bf 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -291,7 +291,7 @@ fsm_remove_epsilons(struct fsm *nfa) * reachable. This doesn't check that the FROM state is reachable from * the start state (trim will do that soon enough), it's just used to * check which states will become unreachable once epsilon edges are - * removed. We don't need to add eager endids for them, because they + * removed. We don't need to add eager outputs for them, because they * will soon be disconnected from the epsilon-free NFA. */ static void mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label) @@ -311,9 +311,7 @@ mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_lab struct fsm_state *s = &nfa->states[s_i]; /* Clear the visited flag, it will be used to avoid cycles. 
*/ -#if 1 assert(s->visited == 0); /* stale */ -#endif s->visited = 0; edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &egi); diff --git a/src/libfsm/exec.c b/src/libfsm/exec.c index 077494b8f..d8cebd843 100644 --- a/src/libfsm/exec.c +++ b/src/libfsm/exec.c @@ -55,7 +55,6 @@ struct check_eager_outputs_for_state_env { static int match_eager_outputs_for_state_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) { - /* HACK update the types here once it's working */ (void)state; struct check_eager_outputs_for_state_env *env = opaque; #if LOG_EAGER @@ -68,7 +67,6 @@ match_eager_outputs_for_state_cb(fsm_state_t state, fsm_end_id_t id, void *opaqu static int match_eager_outputs_for_state(const struct fsm *fsm, fsm_state_t state) { - /* HACK update the types here once it's working */ fsm_eager_output_cb *cb = NULL; void *opaque = NULL; fsm_eager_output_get_cb(fsm, &cb, &opaque); diff --git a/src/libfsm/state.c b/src/libfsm/state.c index 8f1146038..786bc3a25 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -90,7 +90,6 @@ fsm_addstate_bulk(struct fsm *fsm, size_t n) new->visited = 0; new->epsilons = NULL; new->edges = NULL; - new->has_eager_outputs = 0; } diff --git a/tests/eager_output/eager_output_mixed_anchored_unanchored.c b/tests/eager_output/eager_output_mixed_anchored_unanchored.c index 7afb272db..376b49d8a 100644 --- a/tests/eager_output/eager_output_mixed_anchored_unanchored.c +++ b/tests/eager_output/eager_output_mixed_anchored_unanchored.c @@ -2,9 +2,6 @@ int main(void) { - /* fprintf(stderr, "%s: skipping for now, this doesn't pass yet.\n", __FILE__); */ - /* return EXIT_SUCCESS; */ - struct eager_output_test test = { .patterns = { "^abc$", From 3293b7b95a464bcd11573e57b0a6241dc9995878 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 19 Sep 2025 13:40:26 -0400 Subject: [PATCH 50/80] Restore FORCE_ENDIDS behavior for tests/eager_output/eager_output7.c. 
This was removed in af27a87c, while updating for interface changes, but it's worth keeping this. It makes the performance difference between using endids and eager outputs for combining larger sets of unanchored patterns obvious -- on my current machine, `build/tests/eager_output/run7` completes in about 55 msec, whereas `env FORCE_ENDIDS=7 build/tests/eager_output/run7` takes about 88 msec, 8 takes 200 msec, 9 takes 639 msec, 10 takes 1.62 seconds, 11 takes 3.9 seconds, ... without FORCE_ENDIDS, it's running all 26 patterns. (FORCE_ENDIDS < 7 finishes faster, but that's because it's only using the first couple patterns and combining into a much smaller DFA.) Also, a minor bugfix -- instead of asserting that the outputs were >= the expected ID count, check that it's exactly equal. This was due to IDs set by both eager outputs and endids being counted twice; some of the eager_output_alt_mixing_anchored_and_unanchored.c cases could lead to this, due to the mixing of anchored and unanchored subtrees in the regexes. --- tests/eager_output/eager_output7.c | 29 +++++++++++++++++++++++++++ tests/eager_output/utils.c | 32 +++++++++++++++++++++++++++--- tests/eager_output/utils.h | 1 + 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/tests/eager_output/eager_output7.c b/tests/eager_output/eager_output7.c index 94e9f1787..e764ac946 100644 --- a/tests/eager_output/eager_output7.c +++ b/tests/eager_output/eager_output7.c @@ -2,7 +2,26 @@ int main(void) { + /* Run this test with env FORCE_ENDIDS=N ... to see how much more + * expensive it is to combine the first N patterns using endids, + * rather than eager_outputs. It becomes VERY slow for >= 9 or so. + * (Note that the checks probably will not pass for N < 4, because + * it will start skipping patterns that appear in the early test inputs.)
*/ + bool force_endids = false; + size_t force_endid_count = 0; + { + const char *str = getenv("FORCE_ENDIDS"); + if (str != NULL) { + force_endid_count = atoi(str); + if (force_endid_count == 0) { + force_endid_count = 26; + } + force_endids = true; + } + } + struct eager_output_test test = { + .force_endids = force_endids, .patterns = { [0] = "apple", [1] = "banana", @@ -71,5 +90,15 @@ int main(void) }, }; + /* truncate patterns to the first N */ + if (force_endids) { + assert(force_endid_count > 0 && force_endid_count <= 26); + test.patterns[force_endid_count] = NULL; + + /* truncate test inputs to just the first couple, since + * later inputs use later patterns */ + test.inputs[5].input = NULL; + } + return run_test(&test); } diff --git a/tests/eager_output/utils.c b/tests/eager_output/utils.c index dfd2b952b..725da7e41 100644 --- a/tests/eager_output/utils.c +++ b/tests/eager_output/utils.c @@ -83,7 +83,16 @@ run_test(const struct eager_output_test *test) } const size_t id_base = 1; /* offset by 1 because 0 is used as end-of-list */ - struct fsm *fsm = fsm_union_repeated_pattern_group(nfas_used, nfas, NULL, id_base); + struct fsm *fsm; + if (test->force_endids) { + for (size_t i = 0; i < nfas_used; i++) { + fsm_setendid(nfas[i], i + id_base); + } + fsm = fsm_union_array(nfas_used, nfas, NULL); + } else { + /* This function sets the eager output IDs. 
*/ + fsm = fsm_union_repeated_pattern_group(nfas_used, nfas, NULL, id_base); + } assert(fsm != NULL); if (log) { @@ -167,6 +176,7 @@ run_test(const struct eager_output_test *test) fsm_state_t end; /* only set on match */ ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + size_t match_id_count = 0; if (ret == 1) { #define ENDID_BUF_SIZE 32 fsm_end_id_t endid_buf[ENDID_BUF_SIZE] = {0}; @@ -177,10 +187,26 @@ run_test(const struct eager_output_test *test) assert(!"fsm_endid_get failed"); } + match_id_count += outputs.used; + for (size_t e_i = 0; e_i < endid_count; e_i++) { + fsm_end_id_t endid = endid_buf[e_i]; + bool found = false; + for (size_t o_i = 0; o_i < outputs.used; o_i++) { + if (outputs.ids[o_i] == endid) { + found = true; + break; + } + } + if (!found) { + /* Don't count IDs set by both endids AND eager outputs twice. */ + match_id_count++; + } + } + /* Copy endid outputs into outputs.ids[], since for testing * purposes we don't care about the difference between eager * output and endids here. */ - assert(outputs.used + endid_count <= MAX_IDS); + assert(match_id_count <= MAX_IDS); for (size_t endid_i = 0; endid_i < endid_count; endid_i++) { if (log) { fprintf(stderr, "-- adding endid %zd: %d\n", endid_i, endid_buf[endid_i]); @@ -222,7 +248,7 @@ run_test(const struct eager_output_test *test) assert(ret == 1); } - assert(outputs.used >= expected_id_count); + assert(match_id_count == expected_id_count); size_t floor = 0; for (size_t exp_i = 0; exp_i < outputs.used; exp_i++) { diff --git a/tests/eager_output/utils.h b/tests/eager_output/utils.h index 02f8427c9..ee5f941c5 100644 --- a/tests/eager_output/utils.h +++ b/tests/eager_output/utils.h @@ -32,6 +32,7 @@ struct eager_output_test { const char *patterns[MAX_PATTERNS]; + bool force_endids; struct { const char *input; From 255e426ca2672f5929e72e2f8ef864f85579fa1a Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 22 Sep 2025 10:53:32 -0400 Subject: [PATCH 51/80] fsm.h: Fix missing word in comment. 
--- include/fsm/fsm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index d57d43b1e..a811c16ed 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -278,7 +278,7 @@ fsm_increndids(struct fsm * fsm, int delta); * During DFA execution, states with eager outputs will output their * ID when output reaches them. With fsm_exec, this happens via a * callback (see fsm_eager_output_set_cb). Some print languages - * will eventually eager outputs. + * will eventually support eager outputs. * * One use case for eager outputs is combining multiple unanchored * regexes into a single DFA and detecting when input matches more than From 610f2a3c2d2aad4a32c65c46c8401ac0aaf85d2e Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 22 Sep 2025 11:02:15 -0400 Subject: [PATCH 52/80] Note why fsm_union_repeated_pattern_group depends on re_comp. This may not be necessary in the future, but currently analyze_group_nfa in union.c depends on knowing that re_comp will always assign the true unanchored start loop an earlier state ID than the unanchored end loop in order to properly link a few ambiguous cases. --- include/fsm/bool.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/fsm/bool.h b/include/fsm/bool.h index c2c2d80ed..25d456b55 100644 --- a/include/fsm/bool.h +++ b/include/fsm/bool.h @@ -57,8 +57,11 @@ fsm_union_array(size_t fsm_count, * eager outputs can match. Ownership of the NFAs is transferred, they will * be combined (or freed, if they don't have a start state). * - * This MUST be called with NFAs constructed via re_comp, Calling it with - * manually constructed NFAs or DFAs is unsupported. + * This MUST be called with NFAs constructed via re_comp. Calling it + * with manually constructed NFAs or DFAs may lead to incorrect loop + * linking, because in a few ambiguous cases (e.g.
`$|^`) it relies on + * internal details of re_comp's normal construction to correctly + * identify the state representing the unanchored start loop. * * This will set end IDs and/or output IDs representing matching each * of the original NFAs on the combined result, where nfas[i] will From a32aff3cc8b1624d55a8389d26bff456ebc505be Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 28 Oct 2025 12:36:01 +0000 Subject: [PATCH 53/80] Stray comment. --- src/rx/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rx/main.c b/src/rx/main.c index 8fe40eb58..4ac357bc8 100644 --- a/src/rx/main.c +++ b/src/rx/main.c @@ -464,9 +464,6 @@ build_literals_fsm(bool show_stats, } } - /* We don't minimise here because this fsm has multiple endids, - * and the resulting FSM would be very similar to the current DFA */ - #ifndef NDEBUG /* * We could test to see that the fsm isn't any different. @@ -481,6 +478,9 @@ build_literals_fsm(bool show_stats, */ #endif + /* We don't minimise here because this fsm has multiple endids, + * and the resulting FSM would be very similar to the current DFA */ + return fsm; } From fcde3b700ab43ac66b0aebd5caf1f2dea8e297ce Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 16 Oct 2025 15:40:47 -0400 Subject: [PATCH 54/80] Optionally save linkage_info during NFA construction in re_comp. This adds a new flag for re_comp, RE_SAVE_LINKAGE_INFO. If that's set, then save information about the unanchored start and end loops, the end loop end, anchored starts, and anchored ends while building the NFA from the regex. Previously, this info was recovered by analyzing the NFA structure inside of fsm_union_repeated_pattern_group, but it seems less brittle to save those details during construction. This commit adds the flag, saves the info during construction, and after doing analysis it checks the results match. Since there is one case where it doesn't (eager_output_fr3.c), print a message and override it. 
This was actually a bug: that particular edge case leads to detecting the same state as both the unanchored start and end loops. In that specific case, it seems to be harmless, but this commit also adds an assert that the USL and UEL are not the same state. From now on, fsm_union_repeated_pattern_group depends on all of the NFAs passed to it having the linkage_info struct populated. They should come fresh from re_comp, without determinising or other transformations. I added an assert at the end of determinisation checking that the DFA does not have a linkage_info struct. Note: fsm_clone currently doesn't copy the linkage_info. Maybe it should. --- include/re/re.h | 3 ++ src/libfsm/determinise.c | 3 ++ src/libfsm/fsm.c | 7 ++++ src/libfsm/internal.h | 24 +++++++++++ src/libfsm/union.c | 84 +++++++++++++++++++++++++++++++++++++- src/libre/ast_compile.c | 46 ++++++++++++++++++++- tests/eager_output/utils.c | 2 +- 7 files changed, 166 insertions(+), 3 deletions(-) diff --git a/include/re/re.h b/include/re/re.h index 841e4e946..69551d39f 100644 --- a/include/re/re.h +++ b/include/re/re.h @@ -29,6 +29,9 @@ enum re_flags { RE_ANCHORED = 1 << 6, RE_EXTENDED = 1 << 7, /* PCRE extended mode */ RE_END_NL = 1 << 8, /* end anchor matches '\n' */ + /* save info about linkage at construction time, to inform + * later operations -- see fsm_union_repeated_pattern_group */ + RE_SAVE_LINKAGE_INFO = 1 << 9, RE_FLAGS_NONE = 0 }; diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 29a32d4ea..3f748aeae 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -280,6 +280,9 @@ fsm_determinise_with_config(struct fsm *nfa, assert(fsm_all(nfa, fsm_isdfa)); #endif + /* This should not be carried over from the NFA. 
*/ + assert(nfa->linkage_info == NULL); + res = FSM_DETERMINISE_WITH_CONFIG_OK; cleanup: diff --git a/src/libfsm/fsm.c b/src/libfsm/fsm.c index c442c8262..866650a9c 100644 --- a/src/libfsm/fsm.c +++ b/src/libfsm/fsm.c @@ -42,6 +42,11 @@ free_contents(struct fsm *fsm) fsm_endid_free(fsm); fsm_eager_output_free(fsm); + if (fsm->linkage_info != NULL) { + state_set_free(fsm->linkage_info->anchored_starts); + state_set_free(fsm->linkage_info->anchored_ends); + f_free(fsm->alloc, fsm->linkage_info); + } f_free(fsm->alloc, fsm->states); } @@ -72,6 +77,7 @@ fsm_new_statealloc(const struct fsm_alloc *alloc, size_t statealloc) new->endcount = 0; new->capture_info = NULL; new->endid_info = NULL; + new->linkage_info = NULL; new->states = f_malloc(new->alloc, new->statealloc * sizeof *new->states); if (new->states == NULL) { @@ -144,6 +150,7 @@ fsm_move(struct fsm *dst, struct fsm *src) dst->capture_info = src->capture_info; dst->endid_info = src->endid_info; dst->eager_output_info = src->eager_output_info; + dst->linkage_info = src->linkage_info; f_free(src->alloc, src); } diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index 094723fdb..06658a78e 100644 --- a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -18,6 +18,7 @@ struct bm; struct edge_set; struct state_set; struct state_array; +struct linkage_info; /* * The alphabet (Sigma) for libfsm's FSM is arbitrary octets. @@ -80,6 +81,29 @@ struct fsm { struct fsm_capture_info *capture_info; struct endid_info *endid_info; struct eager_output_info *eager_output_info; + struct linkage_info *linkage_info; +}; + +#define LINKAGE_NO_STATE ((fsm_state_t)-1) + +/* Internal structure for storing structural info about an NFA. + * This is currently only used by fsm_union_repeated_pattern_group, + * which needs to identify a couple components of the NFA in order + * to link groups of repeated patterns together correctly.
*/ +struct linkage_info { + /* The states with a /./ self edge representing the unanchored + * start and end, or LINKAGE_NO_STATE. There can be at most one + * of each. */ + fsm_state_t unanchored_start_loop; + fsm_state_t unanchored_end_loop; + + /* The end state following the unanchored end loop. */ + fsm_state_t unanchored_end_loop_end; + + /* States that link to paths only reachable from the beginning of input. */ + struct state_set *anchored_starts; + /* States leading to an anchored end. */ + struct state_set *anchored_ends; }; struct fsm * diff --git a/src/libfsm/union.c b/src/libfsm/union.c index c3e8dad81..f4e7bd6e2 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -619,19 +619,91 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) } } + /* Compare/log the linkage info */ +#define COMPARE_LINKAGE_INFO 1 +#define LOG_LINKAGE_INFO 1 + if (LOG_LINKAGE_INFO) { + struct state_iter si; + state_set_reset(ainfo->anchored_starts, &si); + fsm_state_t s_i; + + fprintf(stderr, "ainfo->anchored_starts count: %zd\n", state_set_count(ainfo->anchored_starts)); + state_set_reset(ainfo->anchored_starts, &si); + while (state_set_next(&si, &s_i)) { + fprintf(stderr, "ainfo->anchored_starts: %d\n", s_i); + } + + fprintf(stderr, "linkage_info->anchored_starts count: %zd\n", state_set_count(nfa->linkage_info->anchored_starts)); + state_set_reset(nfa->linkage_info->anchored_starts, &si); + while (state_set_next(&si, &s_i)) { + fprintf(stderr, "linkage_info->anchored_starts: %d\n", s_i); + } + + assert(state_set_count(nfa->linkage_info->anchored_starts) >= state_set_count(ainfo->anchored_starts)); + state_set_reset(ainfo->anchored_starts, &si); + while (state_set_next(&si, &s_i)) { + assert(state_set_contains(nfa->linkage_info->anchored_starts, s_i)); + } + } + + if (LOG_LINKAGE_INFO) { + struct state_iter si; + state_set_reset(ainfo->anchored_ends, &si); + fsm_state_t s_i; + + fprintf(stderr, "ainfo->anchored_ends count: %zd\n", 
state_set_count(ainfo->anchored_ends)); + state_set_reset(ainfo->anchored_ends, &si); + while (state_set_next(&si, &s_i)) { + fprintf(stderr, "ainfo->anchored_ends: %d\n", s_i); + } + + fprintf(stderr, "linkage_info->anchored_ends count: %zd\n", state_set_count(nfa->linkage_info->anchored_ends)); + state_set_reset(nfa->linkage_info->anchored_ends, &si); + while (state_set_next(&si, &s_i)) { + fprintf(stderr, "linkage_info->anchored_ends: %d\n", s_i); + } + + assert(state_set_count(nfa->linkage_info->anchored_ends) >= state_set_count(ainfo->anchored_ends)); + state_set_reset(ainfo->anchored_ends, &si); + while (state_set_next(&si, &s_i)) { + assert(state_set_contains(nfa->linkage_info->anchored_ends, s_i)); + } + } + #if LOG_ANALYZE_GROUP_NFA_RESULTS { fprintf(stderr, "# analysis_info start %d, usl %d, uel %d, uele %d\n", ainfo->start, ainfo->unanchored_start_loop, ainfo->unanchored_end_loop, ainfo->unanchored_end_loop_end); dump_state_set(stderr, "anchored_ends", ainfo->anchored_ends); dump_state_set(stderr, "eager_matches", ainfo->eager_matches); - dump_edge_set(stderr, "anchored_firsts", ainfo->anchored_start, ainfo->anchored_firsts); dump_edge_set(stderr, "repeatable_firsts", ainfo->unanchored_start_loop, ainfo->repeatable_firsts); } #endif closure_free(nfa, eclosures, state_count); + if (COMPARE_LINKAGE_INFO) { + /* Check that the analysis and saved linkage_info from ast_compile.c match */ + fprintf(stderr, "%s: checking that build-time data matches... 
usl %d, %d; uel %d, %d; uele %d, %d\n", + __func__, + nfa->linkage_info->unanchored_start_loop, ainfo->unanchored_start_loop, + nfa->linkage_info->unanchored_end_loop, ainfo->unanchored_end_loop, + nfa->linkage_info->unanchored_end_loop_end, ainfo->unanchored_end_loop_end); + + if (nfa->linkage_info->unanchored_start_loop != ainfo->unanchored_start_loop) { + fprintf(stderr, "DISAGREEMENT, overriding\n"); + ainfo->unanchored_start_loop = nfa->linkage_info->unanchored_start_loop; + } + + assert(nfa->linkage_info->unanchored_start_loop == ainfo->unanchored_start_loop); + assert(nfa->linkage_info->unanchored_end_loop == ainfo->unanchored_end_loop); + assert(nfa->linkage_info->unanchored_end_loop_end == ainfo->unanchored_end_loop_end); + } + + /* The unanchored start and end loop cannot be the same state. */ + assert(nfa->linkage_info->unanchored_start_loop == NO_STATE + || nfa->linkage_info->unanchored_start_loop != nfa->linkage_info->unanchored_end_loop); + return true; alloc_fail: @@ -869,6 +941,16 @@ fsm_union_repeated_pattern_group(size_t nfa_count, errno = EINVAL; return NULL; } + + /* Any NFAs passed to this function must be built with + * an re_comp flag of RE_SAVE_LINKAGE_INFO, because some + * of the info saved during construction informs + * linking. 
*/ + if (nfas[i]->linkage_info == NULL) { + errno = EINVAL; + return NULL; + } + const size_t count = fsm_countstates(nfas[i]); est_total_states += count; } diff --git a/src/libre/ast_compile.c b/src/libre/ast_compile.c index 0ea0b08d8..5d31ef0a2 100644 --- a/src/libre/ast_compile.c +++ b/src/libre/ast_compile.c @@ -20,6 +20,8 @@ #include +#include + #include "class.h" #include "ast.h" #include "ast_compile.h" @@ -258,6 +260,10 @@ intern_start_any_loop(struct comp_env *env) env->start_any_loop = loop; env->has_start_any_loop = 1; + if (env->fsm->linkage_info) { + env->fsm->linkage_info->unanchored_start_loop = loop; + } + return 1; } @@ -290,6 +296,11 @@ intern_end_any_loop(struct comp_env *env) env->end_any_loop = loop; env->has_end_any_loop = 1; + if (env->fsm->linkage_info != NULL) { + env->fsm->linkage_info->unanchored_end_loop = loop; + env->fsm->linkage_info->unanchored_end_loop_end = env->end; + } + return 1; } @@ -327,6 +338,14 @@ intern_end_nl(struct comp_env *env) env->end_nl = end_nl; env->has_end_nl = 1; + + if (env->fsm->linkage_info != NULL) { + if (!state_set_add(&env->fsm->linkage_info->anchored_ends, + env->fsm->alloc, env->end)) { + return 0; + } + } + return 1; } @@ -718,7 +737,7 @@ comp_iter(struct comp_env *env, } #if LOG_LINKAGE - fprintf(stderr, " ---> x: %d, y: %d\n", x, y); + fprintf(stderr, " ---> x: %d, y: %d, type: %s\n", x, y, ast_node_type_name(n->type)); #endif switch (n->type) { @@ -871,6 +890,20 @@ comp_iter(struct comp_env *env, case AST_EXPR_ANCHOR: EPSILON(x, y); + + if (env->fsm->linkage_info != NULL + && x == env->start + && n->u.anchor.type == AST_ANCHOR_START) { + /* This state is directly linked from the global start. 
*/ +#if LOG_LINKAGE + fprintf(stderr, "%s: adding %d to anchored_starts due to start anchor\n", + __func__, y); +#endif + if (!state_set_add(&env->fsm->linkage_info->anchored_starts, + env->fsm->alloc, y)) { + return 0; + } + } break; case AST_EXPR_SUBTRACT: { @@ -973,6 +1006,17 @@ ast_compile(const struct ast *ast, return NULL; } + if (re_flags & RE_SAVE_LINKAGE_INFO) { + struct linkage_info *li = f_malloc(alloc, sizeof(*fsm->linkage_info)); + if (li == NULL) { goto error; } + li->unanchored_start_loop = LINKAGE_NO_STATE; + li->unanchored_end_loop = LINKAGE_NO_STATE; + li->unanchored_end_loop_end = LINKAGE_NO_STATE; + li->anchored_starts = NULL; + li->anchored_ends = NULL; + fsm->linkage_info = li; + } + if (!fsm_addstate(fsm, &x)) { goto error; } diff --git a/tests/eager_output/utils.c b/tests/eager_output/utils.c index 725da7e41..d1ff4f7b4 100644 --- a/tests/eager_output/utils.c +++ b/tests/eager_output/utils.c @@ -65,7 +65,7 @@ run_test(const struct eager_output_test *test) const char *p = test->patterns[i]; if (test->patterns[i] == NULL) { break; } - struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, RE_SAVE_LINKAGE_INFO, NULL); assert(fsm != NULL); if (log) { From 9971fe126f6ecb8a41dac546ce224bdf7767972f Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 5 Nov 2025 10:27:58 -0500 Subject: [PATCH 55/80] Copy some fields from linkage_info, remove analysis. They need to be copied over (transferring ownership of the state sets) because the linkage_info struct gets freed during the call to fsm_merge. This removes some code that tended to reconstruct the same information by analyzing the NFA structure, but was significantly more complicated, and in at least one case it led to the wrong result (albeit harmlessly). It's simpler to just save the relevant state IDs during NFA construction.
--- src/libfsm/union.c | 288 +++++---------------------------------------- 1 file changed, 28 insertions(+), 260 deletions(-) diff --git a/src/libfsm/union.c b/src/libfsm/union.c index f4e7bd6e2..409553d1d 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -41,17 +41,19 @@ struct analysis_info { fsm_state_t start; /* start state */ /* The states with a /./ self edge representing the unanchored - * start and end, or NO_STATE. There can be at most one of each. */ + * start and end, or LINKAGE_NO_STATE. There can be at most one + * of each. Copied from linkage_info. */ fsm_state_t unanchored_start_loop; fsm_state_t unanchored_end_loop; - /* The end state following the unanchored end loop. */ + /* The end state following the unanchored end loop. + * Copied from linkage_info.*/ fsm_state_t unanchored_end_loop_end; - /* States that link to paths only reachable from the beginning of input. */ + /* States that link to paths only reachable from the beginning of input. + * Copied from linkage_info. */ struct state_set *anchored_starts; - - /* States leading to an anchored end. */ + /* States leading to an anchored end. Copied from linkage_info. */ struct state_set *anchored_ends; /* States with an outgoing labeled edge to the unanchored end loop. 
Input @@ -213,25 +215,6 @@ fsm_union_array(size_t fsm_count, return res; } -static bool -state_has_dot_self_edge(const struct fsm *nfa, fsm_state_t s_i) -{ - const struct fsm_state *s = &nfa->states[s_i]; - - struct edge_group_iter ei; - edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &ei); - struct edge_group_iter_info info; - while (edge_set_group_iter_next(&ei, &info)) { - if (info.to != s_i) { continue; } - for (size_t i = 0; i < 256/64; i++) { - if (info.symbols[i] != (uint64_t)-1) { continue; } - } - return true; - } - - return false; -} - #if LOG_ANALYZE_GROUP_NFA_RESULTS static void dump_state_set(FILE *f, const char *name, const struct state_set *set) @@ -264,63 +247,6 @@ dump_edge_set(FILE *f, const char *name, fsm_state_t from, const struct edge_set } #endif -/* For each state in the epsilon closure, if there's a labeled edge - * to an end state with no outgoing edges, check if the label set is - * only [\n] and there's also an epsilon edge to the same end state. - * If so, this represents an anchored end in the NFA. 
*/ -static bool -state_has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa, - fsm_state_t s_id, struct state_set *s_eclosure, fsm_state_t *dst_end) -{ - struct state_iter si; - state_set_reset(s_eclosure, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - assert(ns_i < nfa->statecount); - const struct fsm_state *ns = &nfa->states[ns_i]; - - if (state_set_empty(ns->epsilons)) { continue; } - if (edge_set_empty(ns->edges)) { continue; } - - struct edge_group_iter iter; - struct edge_group_iter_info info; - edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &iter); - while (edge_set_group_iter_next(&iter, &info)) { - /* Look for an edge set with only '\n' */ - if ((info.symbols[0] != (1ULL << '\n')) - || info.symbols[1] || info.symbols[2] || info.symbols[3]) { - continue; - } - - /* If it's an end, look for an epsilon leeding to the same destination */ - if (fsm_isend(nfa, info.to)) { - assert(info.to < nfa->statecount); - const struct fsm_state *end_candidate = &nfa->states[info.to]; - if (!state_set_empty(end_candidate->epsilons) || - !edge_set_empty(end_candidate->edges)) { - continue; /* not an anchored end */ - } - - struct state_iter inner_si; - fsm_state_t os_i; - - assert(s_id < nfa->statecount); - const struct fsm_state *s = &nfa->states[s_id]; - - state_set_reset(s->epsilons, &inner_si); - while (state_set_next(&inner_si, &os_i)) { - if (os_i == info.to) { - *dst_end = info.to; - return true; - } - } - } - } - } - - return false; -} - static bool state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(const struct fsm *nfa, fsm_state_t s_i, struct state_set **eclosures, @@ -394,11 +320,7 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) } } - memset(ainfo, 0x00, sizeof(*ainfo)); ainfo->start = NO_STATE; - ainfo->unanchored_start_loop = NO_STATE; - ainfo->unanchored_end_loop = NO_STATE; - ainfo->unanchored_end_loop_end = NO_STATE; ainfo->eager_match_state = NO_STATE; if (!fsm_getstart(nfa, 
&ainfo->start)) { @@ -445,80 +367,6 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) } } - /* Iterate over the start state's epsilon edges, attempting to - * identify the unanchored start loop and anchored start states - * (if present). - * - * Note: This uses the start state's epsilon set rather than its - * epsilon closure because (by construction) the unanchored - * start loop and anchored start states will both be directly - * connected to the start state. Using the epsilon closure can - * mis-identify the unanchored *end* loop as the start loop, if - * there is a path with only epsilon edges between them. */ - struct state_iter si; - state_set_reset(nfa->states[ainfo->start].epsilons, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - /* Ignore self edges. */ - if (ns_i == ainfo->start) { continue; } - - /* If there's a state in the start state's epsilon set that - * has a dot self-edge, it's the unanchored start loop. */ - if (state_has_dot_self_edge(nfa, ns_i)) { - if (LOG_ANALYZE_GROUP_NFA) { - fprintf(stderr, "%s: unanchored_start_loop found on state %d\n", __func__, ns_i); - } - - /* By construction, the true unanchored start loop is only reachable - * via an epsilon edge from the start state, so if any other state - * has an epsilon or labeled edge to this one, it cannot be the - * unanchored start loop. - * - * This is necessary for cases like '^|x', which produces: - * - * 0 -> 2; - * 0 -> 3; - * 2 -> 2 "\x00" .. "\xff"; - * 2 -> 3 "x"; - * 3 -> 1; - * 3 -> 3 ?; - * - * start: 0; - * end: 1 = [0]; - * - * where this analysis would otherwise identify both 2 (correct) - * and 3 (incorrect) as the unanchored start loop. Both are reachable - * from the start state via an epsilon edge, but the labeled edge - * 2->3 'x' rules 3 out. 
- * */ - if (state_set_contains(ainfo->reachable_from_nonstart_state, ns_i)) { - continue; - } - - /* The reachable_from_nonstart_state check handles the other cases, - * but for `$|^` other attempts to distinguish them will fail, - * but by construction the USL will have the earlier state ID. */ - if (ainfo->unanchored_start_loop != NO_STATE && - ainfo->unanchored_start_loop < ns_i) { - continue; - } - - ainfo->unanchored_start_loop = ns_i; - continue; - } else { - /* Otherwise, a state without a dot self-edge is an anchored start. - * There may be more than one. */ - if (LOG_ANALYZE_GROUP_NFA) { - fprintf(stderr, "%s: anchored_start found on state %d\n", __func__, ns_i); - } - - if (!state_set_add(&ainfo->anchored_starts, nfa->alloc, ns_i)) { - goto alloc_fail; - } - continue; - } - } - /* Copy labeled edges from the unanchored start loop and * its epsilon closure to ainfo->repeatable_firsts, except * for edges leading back to the unanchored start loop. */ @@ -576,35 +424,8 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) * trivially match, but otherwise it would never match. */ ainfo->nullable = start_state_epsilon_closure_matches_empty_string(nfa, eclosures[ainfo->start]); - /* If there's a state with a dot self-edge and an epsilon edge to an end state, it's - * the unanchored end loop. There should only be one. */ - for (size_t s_i = 0; s_i < state_count; s_i++) { - const struct fsm_state *s = &nfa->states[s_i]; - if (state_has_dot_self_edge(nfa, s_i)) { - struct state_iter si; - state_set_reset(s->epsilons, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - if (fsm_isend(nfa, ns_i)) { - assert(ainfo->unanchored_end_loop == NO_STATE); - ainfo->unanchored_end_loop = s_i; - ainfo->unanchored_end_loop_end = ns_i; - break; - } - } - if (ainfo->unanchored_end_loop != NO_STATE) { break; } - } - } - /* Collect states that lead to an anchored end or eager match. 
*/ for (size_t s_i = 0; s_i < state_count; s_i++) { - fsm_state_t dst_end = NO_STATE; - if (state_has_epsilon_and_newline_edges_to_same_end(nfa, s_i, eclosures[s_i], &dst_end)) { - if (!state_set_add(&ainfo->anchored_ends, nfa->alloc, dst_end)) { - goto alloc_fail; - } - } - fsm_state_t indirect_dst = NO_STATE; if (state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(nfa, s_i, eclosures, ainfo->unanchored_end_loop, &indirect_dst)) { if (!state_set_add(&ainfo->eager_matches, nfa->alloc, s_i)) { @@ -619,57 +440,6 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) } } - /* Compare/log the linkage info */ -#define COMPARE_LINKAGE_INFO 1 -#define LOG_LINKAGE_INFO 1 - if (LOG_LINKAGE_INFO) { - struct state_iter si; - state_set_reset(ainfo->anchored_starts, &si); - fsm_state_t s_i; - - fprintf(stderr, "ainfo->anchored_starts count: %zd\n", state_set_count(ainfo->anchored_starts)); - state_set_reset(ainfo->anchored_starts, &si); - while (state_set_next(&si, &s_i)) { - fprintf(stderr, "ainfo->anchored_starts: %d\n", s_i); - } - - fprintf(stderr, "linkage_info->anchored_starts count: %zd\n", state_set_count(nfa->linkage_info->anchored_starts)); - state_set_reset(nfa->linkage_info->anchored_starts, &si); - while (state_set_next(&si, &s_i)) { - fprintf(stderr, "linkage_info->anchored_starts: %d\n", s_i); - } - - assert(state_set_count(nfa->linkage_info->anchored_starts) >= state_set_count(ainfo->anchored_starts)); - state_set_reset(ainfo->anchored_starts, &si); - while (state_set_next(&si, &s_i)) { - assert(state_set_contains(nfa->linkage_info->anchored_starts, s_i)); - } - } - - if (LOG_LINKAGE_INFO) { - struct state_iter si; - state_set_reset(ainfo->anchored_ends, &si); - fsm_state_t s_i; - - fprintf(stderr, "ainfo->anchored_ends count: %zd\n", state_set_count(ainfo->anchored_ends)); - state_set_reset(ainfo->anchored_ends, &si); - while (state_set_next(&si, &s_i)) { - fprintf(stderr, "ainfo->anchored_ends: %d\n", s_i); - } - - fprintf(stderr, 
"linkage_info->anchored_ends count: %zd\n", state_set_count(nfa->linkage_info->anchored_ends)); - state_set_reset(nfa->linkage_info->anchored_ends, &si); - while (state_set_next(&si, &s_i)) { - fprintf(stderr, "linkage_info->anchored_ends: %d\n", s_i); - } - - assert(state_set_count(nfa->linkage_info->anchored_ends) >= state_set_count(ainfo->anchored_ends)); - state_set_reset(ainfo->anchored_ends, &si); - while (state_set_next(&si, &s_i)) { - assert(state_set_contains(nfa->linkage_info->anchored_ends, s_i)); - } - } - #if LOG_ANALYZE_GROUP_NFA_RESULTS { fprintf(stderr, "# analysis_info start %d, usl %d, uel %d, uele %d\n", @@ -682,27 +452,9 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) closure_free(nfa, eclosures, state_count); - if (COMPARE_LINKAGE_INFO) { - /* Check that the analysis and saved linkage_info from ast_compile.c match */ - fprintf(stderr, "%s: checking that build-time data matches... usl %d, %d; uel %d, %d; uele %d, %d\n", - __func__, - nfa->linkage_info->unanchored_start_loop, ainfo->unanchored_start_loop, - nfa->linkage_info->unanchored_end_loop, ainfo->unanchored_end_loop, - nfa->linkage_info->unanchored_end_loop_end, ainfo->unanchored_end_loop_end); - - if (nfa->linkage_info->unanchored_start_loop != ainfo->unanchored_start_loop) { - fprintf(stderr, "DISAGREEMENT, overriding\n"); - ainfo->unanchored_start_loop = nfa->linkage_info->unanchored_start_loop; - } - - assert(nfa->linkage_info->unanchored_start_loop == ainfo->unanchored_start_loop); - assert(nfa->linkage_info->unanchored_end_loop == ainfo->unanchored_end_loop); - assert(nfa->linkage_info->unanchored_end_loop_end == ainfo->unanchored_end_loop_end); - } - /* The unanchored start and end loop cannot be the same state. 
*/ - assert(nfa->linkage_info->unanchored_start_loop == NO_STATE - || nfa->linkage_info->unanchored_start_loop != nfa->linkage_info->unanchored_end_loop); + assert(ainfo->unanchored_start_loop == NO_STATE + || ainfo->unanchored_start_loop != ainfo->unanchored_end_loop); return true; @@ -912,8 +664,6 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base) static void free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) { - state_set_free(ainfo->anchored_ends); - state_set_free(ainfo->anchored_starts); state_set_free(ainfo->eager_matches); state_set_free(ainfo->needs_indirect_epsilon_edge_to_eager_match_state); state_set_free(ainfo->reachable_from_nonstart_state); @@ -958,8 +708,26 @@ fsm_union_repeated_pattern_group(size_t nfa_count, for (size_t i = 0; i < nfa_count; i++) { struct fsm *fsm = nfas[i]; + struct analysis_info *ainfo = &ainfos[i]; + + /* Copy these fields over, because fsm->linkage_info will be + * freed during the call to fsm_merge below. */ + { + struct linkage_info *linkage_info = fsm->linkage_info; + + ainfo->unanchored_start_loop = linkage_info->unanchored_start_loop; + ainfo->unanchored_end_loop = linkage_info->unanchored_end_loop; + ainfo->unanchored_end_loop_end = linkage_info->unanchored_end_loop_end; + + /* Transfer ownership of these. */ + ainfo->anchored_starts = linkage_info->anchored_starts; + linkage_info->anchored_starts = NULL; + ainfo->anchored_ends = linkage_info->anchored_ends; + linkage_info->anchored_ends = NULL; + } + /* Identify various states in the NFA that will be relevant to combining. */ - if (!analyze_group_nfa(fsm, &ainfos[i])) { + if (!analyze_group_nfa(fsm, ainfo)) { goto fail; } From 50a5fe76b5cc06b77eb7fd915aaaf14622ed7912 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 5 Nov 2025 10:52:00 -0500 Subject: [PATCH 56/80] Restore freeing of state sets copied from linkage_info. 
--- src/libfsm/union.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 409553d1d..4b07ca6cf 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -664,6 +664,8 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base) static void free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) { + state_set_free(ainfo->anchored_starts); + state_set_free(ainfo->anchored_ends); state_set_free(ainfo->eager_matches); state_set_free(ainfo->needs_indirect_epsilon_edge_to_eager_match_state); state_set_free(ainfo->reachable_from_nonstart_state); From c65d68c9d7802f5b18c6bf7cfa953ad8c46ad3e8 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 20 Nov 2025 16:12:41 -0500 Subject: [PATCH 57/80] Update stale comment. This wasn't updated to mention RE_SAVE_LINKAGE_INFO. --- include/fsm/bool.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/fsm/bool.h b/include/fsm/bool.h index 25d456b55..150d63f5d 100644 --- a/include/fsm/bool.h +++ b/include/fsm/bool.h @@ -57,11 +57,12 @@ fsm_union_array(size_t fsm_count, * eager outputs can match. Ownership of the NFAs is transferred, they will * be combined (or freed, if they don't have a start state). * - * This MUST be called with NFAs constructed via re_comp, Calling it - * with manually constructed NFAs or DFAs may lead to incorrect loop - * linking, because in a few ambiguous cases (e.g. `$|^`) it relies on - * internal details of re_comp's normal construction to correctly - * identify the state representing the unanchored start loop. + * This must be called with NFAs constructed via re_comp, using its + * RE_SAVE_LINKAGE_INFO flag. That saves details during construction + * that are necessary to correctly handle anchoring while linking + * them into the combined NFA. If any of the NFAs do not have that + * information populated, the whole set will be rejected and it + * will return NULL. 
* * This will set end IDs and/or output IDs representing matching each * of the original NFAs on the combined result, where nfas[i] will From 8debf240a40242333e177377e36c4912818d8ab8 Mon Sep 17 00:00:00 2001 From: Grace Susanto Date: Tue, 25 Nov 2025 14:02:12 +0000 Subject: [PATCH 58/80] Add documentation guide on how to use Libfsm effectively --- README.md | 6 ++ doc/GUIDE.md | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 doc/GUIDE.md diff --git a/README.md b/README.md index 545725fe9..d72b4fece 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,12 @@ lx is an attempt to produce a simple, expressive, and unobtrusive lexer generator which is good at lexing, does just lexing, is language independent, and has no other features. +### Performance and Requirements + +libfsm is not a drop-in replacement for other regex engines and it only supports patterns that can be compiled to deterministic FSMs. In return, supported patterns run in linear time. + +**→ See [doc/GUIDE.md](doc/GUIDE.md) for detailed unsupported features, usage patterns, and examples.** + ### Building from source Clone with submodules (contains required .mk files): diff --git a/doc/GUIDE.md b/doc/GUIDE.md new file mode 100644 index 000000000..3fe458c41 --- /dev/null +++ b/doc/GUIDE.md @@ -0,0 +1,186 @@ +# Using libfsm for High-Performance Pattern Matching + +libfsm compiles regular expressions to deterministic finite state machines (FSMs) and generates executable code. FSM-based matching runs in **linear time O(n)** with **no backtracking**. + +**libfsm is not a drop-in replacement for traditional regex engines.** It only supports patterns that can be compiled to FSMs. + +## What libfsm Cannot Do + +These PCRE features will not compile: + +* Word boundaries (`\b`) +* Non-greedy quantifiers (`*?`, `+?`, `??`) +* Group capture and backreferences +* Lookahead/lookbehind assertions (`(?=`, `(?!`, `(?<=`, `(? 
user_detector.go +``` + +This produces a standalone matcher function. + +--- + +## Supported Code Generation Targets + +libfsm provides stable, “first-class” code generation for: + +| Category | Output | +| -------------------- | ------------------------------ | +| High-level languages | **C (via `-l vmc`), Go, Rust** | +| Toolchains | **LLVM IR** | +| Virtualization | **Native WebAssembly** | + +> Adding code generation for new languages is template-driven and straightforward. + +--- + +## Workflow Overview + +libfsm provides two main tools: **`re`** takes patterns from command line, **`rx`** takes patterns from file. + +### 1. Validate the Regex + +Test behavior using any PCRE-compatible tool (e.g., [https://regex101.com/](https://regex101.com/)). + +### 2. Verify libfsm Compatibility + +```bash +re -r pcre -l ast 'x*?' +# Output: /x*?/:3: Unsupported operator + +rx -r pcre -l ast -d declined.txt 'x*?' +# Unsupported character in declined.txt +``` + +If unsupported constructs exist, libfsm reports the failing location. + +### 3. Generate Code + +```bash +re -p -r pcre -l rust -k str '^item-[A-Z]{3}$' > item_detector.rs +``` + +### 4. 
Multiple Patterns + +```bash +# re - patterns from command line: +re -p -r pcre -l go -k str '^x?a b+c$' '^x*def?$' '^x$' + +# rx - patterns from file: +rx -p -r pcre -l vmc -k str -d skipped.txt patterns.txt > detectors.c +``` + +Both tools: +* Combine all patterns into one function (like using `|` to join them) +* Return `(bool, int)` - match status and pattern ID +* Pattern ID is argument position for `re`, line number for `rx` +* When encountering unsupported patterns: `rx` skips them to `-d` file and generates code with working patterns; `re` fails completely + +--- + +### Flag Reference +| Flag | Purpose | Common Options | Notes | +| ---- | --------------------------- | ------------------------------------------ | ------------------------------------------ | +| `-r` | Select regex dialect | `pcre`, `literal`, `glob`, `native`, `sql` | `pcre` supports the widest set of features | +| `-l` | Choose output language | `go`, `rust`, `vmc`, `llvm`, `wasm`, `dot` | Use `vmc` for `C` code, pipe `dot` into `idot` for visualization | +| `-k` | Generated function I/O API | `str`, `getc`, `pair` | `str` takes string, `pair` takes byte array, `getc` uses callback for streaming | +| `-p` | Production mode | *(no value)* | Generates optimized code | +| `-d` | Output unsupported patterns | filename | Only applies to `rx` (batch mode) | + +For more detailed information on flags, see [include/fsm/options.h](../include/fsm/options.h) and the man pages (by running `build/man/re.1/re.1` after `bmake doc`). + +--- + +## Writing Effective libfsm Patterns + +For additional regex best practices, see [Fastly's regex guide](https://www.fastly.com/documentation/reference/vcl/regex/#best-practices-and-common-mistakes). + +### 1. Replace Broad Wildcards + +Avoid `.*` whenever possible. Use negated character classes: + +| Avoid | Better | +| ---------- | -------------- | +| `<.*>` | `<[^>]*>` | +| `\((.*)\)` | `\([^)]*\)` | +| `price=.*` | `price=[0-9]+` | + +--- + +### 2. 
Anchor When You Require Full Matches + +FSMs only do what’s specified. Explicitly anchor when matching entire strings: + +```regex +^task-[a-z]+-[0-9]{2}\z +``` + +Use `\z` for end-of-string. + +--- + +## Byte Search Optimization (Optional) + +Patterns that start with an **uncommon character** can be accelerated using an initial byte scan before running the FSM. +This quickly jumps to likely match positions instead of scanning every byte. + +### Common fast byte search APIs + +| Language | Function | +| -------- | -------------------------- | +| Go | `strings.IndexByte` | +| Rust | `memchr::memchr` | +| C | `memchr` from `<string.h>` | + + +### Good candidates + +Patterns that always start with uncommon prefix characters, for example: + +``` +#tag-[a-z]+ +@user-[0-9]+ +\[section\] +{"key": +"name='[^']+'" +``` + +These prefixes (`#`, `@`, `[`, `{`, `'`, `"`) rarely appear in normal text, making a byte search highly effective. + +--- + +## Troubleshooting + +### Pattern Matches Empty String Unintentionally + +Pattern: + +```regex +\s* +``` + +Will compile to code that always returns true. + +**Fix options:** + +* Require at least one match: `\s+` +* Anchor context: `^\s+$` +* Or alternatively, use `-Fb` flag + +### Compilation Takes Too Long + +Likely caused by unrestricted wildcards (`.*`, `.+`). Fix with: + +* Negated classes (`[^)]*`) +* Bounded repeats (`{0,50}`) +* Pattern splitting From 7306570e3b1556c9ab8a216463753f893797ec51 Mon Sep 17 00:00:00 2001 From: Grace Susanto Date: Thu, 27 Nov 2025 12:16:28 +0000 Subject: [PATCH 59/80] Update docs based on reviews --- doc/GUIDE.md | 156 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 117 insertions(+), 39 deletions(-) diff --git a/doc/GUIDE.md b/doc/GUIDE.md index 3fe458c41..ff7abbc7d 100644 --- a/doc/GUIDE.md +++ b/doc/GUIDE.md @@ -2,15 +2,33 @@ libfsm compiles regular expressions to deterministic finite state machines (FSMs) and generates executable code.
FSM-based matching runs in **linear time O(n)** with **no backtracking**. +> Regex engines like PCRE use backtracking to explore multiple possible match paths at **runtime**. +> This means the same pattern can have different execution costs depending on the input. +> +> libfsm instead resolves all match decisions at **compile time** by constructing a Deterministic Finite Automaton (DFA). +> At runtime, matching is a single linear pass over the input with no alternative paths to explore. +> +> As a result, libfsm avoids input-dependent slowdowns and is not susceptible to regular expression–based denial-of-service (ReDoS) attacks. + **libfsm is not a drop-in replacement for traditional regex engines.** It only supports patterns that can be compiled to FSMs. +### **Topics** + +- [What libfsm Cannot Do](#what-libfsm-cannot-do) +- [Quick Start](#quick-start) +- [Supported Code Generation Targets](#supported-code-generation-targets) +- [Workflow Overview](#workflow-overview) +- [Writing Effective libfsm Patterns](#writing-effective-libfsm-patterns) +- [Byte Search Optimization (Optional)](#byte-search-optimization-optional) +- [Troubleshooting](#troubleshooting) + ## What libfsm Cannot Do These PCRE features will not compile: * Word boundaries (`\b`) * Non-greedy quantifiers (`*?`, `+?`, `??`) -* Group capture and backreferences +* Group capture (coming soon!) and backreferences * Lookahead/lookbehind assertions (`(?=`, `(?!`, `(?<=`, `(? Adding code generation for new languages is template-driven and straightforward. +> Adding code generation for new languages is straightforward and is defined in [src/libfsm/print/](../src/libfsm/print/). --- ## Workflow Overview -libfsm provides two main tools: **`re`** takes patterns from command line, **`rx`** takes patterns from file. +libfsm provides two main tools: + - **`re`** takes patterns from command line + - **`rx`** takes patterns from file + +A recommended workflow when using libfsm is: ### 1. 
Validate the Regex -Test behavior using any PCRE-compatible tool (e.g., [https://regex101.com/](https://regex101.com/)). +Test behavior using any PCRE-compatible tool (e.g., [pcregrep(1)](https://man7.org/linux/man-pages/man1/pcregrep.1.html) on the CLI or [https://regex101.com/](https://regex101.com/) in the browser). ### 2. Verify libfsm Compatibility ```bash re -r pcre -l ast 'x*?' # Output: /x*?/:3: Unsupported operator +# :3 indicates that the character at offset 3 in the pattern is rejected. rx -r pcre -l ast -d declined.txt 'x*?' # Unsupported character in declined.txt @@ -67,7 +90,7 @@ If unsupported constructs exist, libfsm reports the failing location. ### 3. Generate Code ```bash -re -p -r pcre -l rust -k str '^item-[A-Z]{3}$' > item_detector.rs +re -p -r pcre -l rust -k str '^item-[A-Z]{3}\z' > item_detector.rs ``` ### 4. Multiple Patterns @@ -89,63 +112,112 @@ Both tools: --- ### Flag Reference -| Flag | Purpose | Common Options | Notes | -| ---- | --------------------------- | ------------------------------------------ | ------------------------------------------ | -| `-r` | Select regex dialect | `pcre`, `literal`, `glob`, `native`, `sql` | `pcre` supports the widest set of features | -| `-l` | Choose output language | `go`, `rust`, `vmc`, `llvm`, `wasm`, `dot` | Use `vmc` for `C` code, pipe `dot` into `idot` for visualization | -| `-k` | Generated function I/O API | `str`, `getc`, `pair` | `str` takes string, `pair` takes byte array, `getc` uses callback for streaming | -| `-p` | Production mode | *(no value)* | Generates optimized code | -| `-d` | Output unsupported patterns | filename | Only applies to `rx` (batch mode) | +| Flag | Purpose | Common Options | Notes | +| ---- | ---------------------------- | ------------------------------------------ | ---------------------------------------------------------------- | +| `-r` | Regex dialect | `pcre`, `literal`, `glob`, `native`, `sql` | `pcre` supports the widest set of features | +| `-l` | Output 
language for printing | `go`, `rust`, `vmc`, `llvm`, `wasm`, `dot` | Use `vmc` for `C` code. Pipe `dot` into `idot` for visualization | +| `-k` | Generated function I/O API | `str`, `getc`, `pair` | `str` takes string, `pair` takes byte array, `getc` uses callback for streaming | +| `-p` | Print mode | *(no value)* | Abbrv. of `-l fsm`. Print the constructed fsm, rather than executing it. | +| `-d` | Declined | filename | Only applies to `rx` (batch mode) | -For more detailed information on flags, see [include/fsm/options.h](../include/fsm/options.h) and the man pages (by running `build/man/re.1/re.1` after `bmake doc`). +This is not exhausted list. For full flag details, see [include/fsm/options.h](../include/fsm/options.h) and the [man pages](../man). +The man pages can be built by running `bmake doc`, then view with `build/man/re.1/re.1`. --- ## Writing Effective libfsm Patterns -For additional regex best practices, see [Fastly's regex guide](https://www.fastly.com/documentation/reference/vcl/regex/#best-practices-and-common-mistakes). - ### 1. Replace Broad Wildcards -Avoid `.*` whenever possible. Use negated character classes: +Avoid `.*` and `.+` when possible. Wildcards match “anything,” which is often imprecise and forces libfsm to build a large DFA. + +For example, a double-quoted string should not use `".*?"` because the content cannot contain an unescaped quote. +Instead, restrict it to the actual valid characters `"[^"\r\n]*"`, which matches only what is allowd and will keep the DFA more compact. + +Use negated character classes: | Avoid | Better | | ---------- | -------------- | | `<.*>` | `<[^>]*>` | | `\((.*)\)` | `\([^)]*\)` | -| `price=.*` | `price=[0-9]+` | +| `price=.+` | `price=[0-9]+` | +| `var\s.+=` | `var\s[^=]+=` | + +> This is often the cause of an “explosion” in the size of the generated FSM. +> +> See [Compilation Takes Too Long](#compilation-takes-too-long) for more details. --- -### 2. Anchor When You Require Full Matches +### 2. 
Anchor When Matching Full String -FSMs only do what’s specified. Explicitly anchor when matching entire strings: +When the intention is to match an entire string, use anchors. +Use `^` at the beginning and `\z` for the true end of the string. ```regex -^task-[a-z]+-[0-9]{2}\z +# Correct: matches only this exact hostname +^web\d+\.example\.com\z  + +# Incorrect: would match inside a larger string +web\d+\.example\.com # also matches "foo-web12.example.com-bar" ``` -Use `\z` for end-of-string. +--- + +### 3. Prefer `\z` Over `$` for End-of-String + +`\z` always matches the end of the string. +`$` will also match a trailing newline at the end of the string, +so if you use this in combination with capturing groups, you may not be capturing what you expect. +Also, `\z` is more efficient, so it is better to use it in places where `\n` cannot appear. + +```regex +# Preferred +/foo\z + +# Risky: $ may allow an extra newline +/foo$ +``` --- -## Byte Search Optimization (Optional) +### 4. Escape Special Characters When Used As Literal -Patterns that start with an **uncommon character** can be accelerated using an initial byte scan before running the FSM. -This quickly jumps to likely match positions instead of scanning every byte. +Many characters have special meaning in regex (for example `.`, `+`, `*`, `?`, `[`, `(`). 
+If you mean to match them literally, escape them: -### Common fast byte search APIs +| Literal You Want | Correct Regex | Explanation | +|----------------------------|-----------------------------|--------------------------------------------| +| `example.com` | `example\.com` | `.` matches any character unless escaped | +| `a+b` | `a\+b` | `+` means “one or more” | +| `price?` | `price\?` | `?` means “optional” | +| `[value]` | `\[value\]` | `[` and `]` start/end a character class | +| `(test)` | `\(test\)` | `(` and `)` begin/end a group | +| Markdown link `[t](u)` | `(\[[^]]*\]\([^)]*\))` | Matches `[text](url)` without crossing `]` or `)` | -| Language | Function | -| -------- | -------------------------- | -| Go | `strings.IndexByte` | -| Rust | `memchr::memchr` | -| C | `memchr` from `<string.h>` | +--- +### 5. Use Non-Capturing Groups -### Good candidates +Capture groups are _currently_ not supported (coming soon!). +If you need grouping for alternation or precedence, use non-capturing syntax `(?:...)`: -Patterns that always start with uncommon prefix characters, for example: +```regex +# Correct +(?:private|no-store) + +# Unsupported +(private|no-store) +``` + +--- + +## Byte Search Optimization (Optional) + +Patterns that start with an **uncommon character** can be accelerated using an initial byte scan before running the FSM. +This quickly jumps to likely match positions instead of scanning every byte. + +Good candidates are patterns that start with uncommon prefix characters, for example: ``` #tag-[a-z]+ @@ -155,7 +227,9 @@ Patterns that always start with uncommon prefix characters, for example: "name='[^']+'" ``` -These prefixes (`#`, `@`, `[`, `{`, `'`, `"`) rarely appear in normal text, making a byte search highly effective. +These prefixes (`#`, `@`, `[`, `{`, `'`, `"`) are rare in normal text, so a byte search can skip ahead before running the matcher. 
+ +We found using `strings.IndexByte` before calling the generated matcher in Go code significantly improved performance when matching strings with a large (>5k) leading prefix. --- @@ -171,16 +245,20 @@ Pattern: Will compile to code that always returns true. +This is only an issue if that is not what you intend. + **Fix options:** * Require at least one match: `\s+` -* Anchor context: `^\s+$` -* Or alternatively, use `-Fb` flag +* Anchor context: `^\s+$` or alternatively, use `-Fb` flag ### Compilation Takes Too Long -Likely caused by unrestricted wildcards (`.*`, `.+`). Fix with: +This is often caused by unrestricted wildcards (`.*`, `.+`). +Although they look compact, libfsm must enumerate every possible byte and every possible continuation, causing the state machine to grow quickly. + +For example, to match `var anything =`, a pattern such as `var\s.+=` looks simple, but `.+` forces libfsm to encode every possible byte +and every possible continuation -- including both the presence and absence of `=`. This drastically increases the number of states. -* Negated classes (`[^)]*`) -* Bounded repeats (`{0,50}`) -* Pattern splitting +When compilation is slow, look for broad wildcards and replace them with more specific character classes (as shown [above](#writing-effective-libfsm-patterns)), +such as: `var\s[^=]+=`. From 20f262cbaf40454b32ed1245da99555db84a1347 Mon Sep 17 00:00:00 2001 From: Grace Susanto Date: Thu, 27 Nov 2025 15:30:16 +0000 Subject: [PATCH 60/80] Revision #2 --- doc/GUIDE.md | 121 ++++++++++++++++++++------------------------------- 1 file changed, 46 insertions(+), 75 deletions(-) diff --git a/doc/GUIDE.md b/doc/GUIDE.md index ff7abbc7d..26ea155c6 100644 --- a/doc/GUIDE.md +++ b/doc/GUIDE.md @@ -2,13 +2,13 @@ libfsm compiles regular expressions to deterministic finite state machines (FSMs) and generates executable code. FSM-based matching runs in **linear time O(n)** with **no backtracking**. 
-> Regex engines like PCRE use backtracking to explore multiple possible match paths at **runtime**. -> This means the same pattern can have different execution costs depending on the input. -> -> libfsm instead resolves all match decisions at **compile time** by constructing a Deterministic Finite Automaton (DFA). -> At runtime, matching is a single linear pass over the input with no alternative paths to explore. -> -> As a result, libfsm avoids input-dependent slowdowns and is not susceptible to regular expression–based denial-of-service (ReDoS) attacks. +Regex engines like PCRE use backtracking to explore multiple possible match paths at **runtime**. +This means the same pattern can have different execution costs depending on the input. + +libfsm instead resolves all match decisions at **compile time** by constructing a Deterministic Finite Automaton (DFA). +At runtime, matching is a single linear pass over the input with no alternative paths to explore. + +As a result, libfsm avoids input-dependent slowdowns and is not susceptible to regular expression–based denial-of-service (ReDoS) attacks. **libfsm is not a drop-in replacement for traditional regex engines.** It only supports patterns that can be compiled to FSMs. @@ -21,6 +21,7 @@ libfsm compiles regular expressions to deterministic finite state machines (FSMs - [Writing Effective libfsm Patterns](#writing-effective-libfsm-patterns) - [Byte Search Optimization (Optional)](#byte-search-optimization-optional) - [Troubleshooting](#troubleshooting) +- [Pattern Matches Empty String Unintentionally](#pattern-matches-empty-string-unintentionally) ## What libfsm Cannot Do @@ -33,8 +34,6 @@ These PCRE features will not compile: * Conditional expressions (`(?(condition)then|else)`) * Recursion and subroutines (`(?R)`, `(?1)`) ---- - ## Quick Start Generate a matcher from a regex: @@ -46,21 +45,14 @@ re -p -r pcre -l go -k str 'user\d+' > user_detector.go This produces a standalone matcher function. 
---- - ## Supported Code Generation Targets libfsm provides stable, “first-class” code generation for: +- High-level languages: C (via `-l vmc`), Go, Rust +- LLVM IR +- Native WebAssembly -| Category | Output | -| -------------------- | ------------------------------ | -| High-level languages | **C (via `-l vmc`), Go, Rust** | -| Toolchains | **LLVM IR** | -| Virtualization | **Native WebAssembly** | - -> Adding code generation for new languages is straightforward and is defined in [src/libfsm/print/](../src/libfsm/print/). - ---- +Adding code generation for new languages is straightforward and is defined in [src/libfsm/print/](../src/libfsm/print/). ## Workflow Overview @@ -70,11 +62,11 @@ libfsm provides two main tools: A recommended workflow when using libfsm is: -### 1. Validate the Regex +1. Validate the Regex Test behavior using any PCRE-compatible tool (e.g., [pcregrep(1)](https://man7.org/linux/man-pages/man1/pcregrep.1.html) on the CLI or [https://regex101.com/](https://regex101.com/) in the browser). -### 2. Verify libfsm Compatibility +2. Verify libfsm Compatibility ```bash re -r pcre -l ast 'x*?' @@ -87,13 +79,13 @@ rx -r pcre -l ast -d declined.txt 'x*?' If unsupported constructs exist, libfsm reports the failing location. -### 3. Generate Code +3. Generate Code ```bash re -p -r pcre -l rust -k str '^item-[A-Z]{3}\z' > item_detector.rs ``` -### 4. Multiple Patterns +4. 
Multiple Patterns ```bash # re - patterns from command line: @@ -109,8 +101,6 @@ Both tools: * Pattern ID is argument position for `re`, line number for `rx` * When encountering unsupported patterns: `rx` skips them to `-d` file and generates code with working patterns; `re` fails completely ---- - ### Flag Reference | Flag | Purpose | Common Options | Notes | | ---- | ---------------------------- | ------------------------------------------ | ---------------------------------------------------------------- | @@ -118,23 +108,22 @@ Both tools: | `-l` | Output language for printing | `go`, `rust`, `vmc`, `llvm`, `wasm`, `dot` | Use `vmc` for `C` code. Pipe `dot` into `idot` for visualization | | `-k` | Generated function I/O API | `str`, `getc`, `pair` | `str` takes string, `pair` takes byte array, `getc` uses callback for streaming | | `-p` | Print mode | *(no value)* | Abbrv. of `-l fsm`. Print the constructed fsm, rather than executing it. | -| `-d` | Declined | filename | Only applies to `rx` (batch mode) | +| `-d` | Declined patterns | filename | Only applies to `rx` (batch mode) | This is not exhausted list. For full flag details, see [include/fsm/options.h](../include/fsm/options.h) and the [man pages](../man). -The man pages can be built by running `bmake doc`, then view with `build/man/re.1/re.1`. - ---- +The man pages can be built by running `bmake -r doc`, then view with `build/man/re.1/re.1`. ## Writing Effective libfsm Patterns -### 1. Replace Broad Wildcards +1. Replace Broad Wildcards + +Avoid `.*` and `.+` when possible. Wildcards match “anything,” which is often imprecise. And although they look compact, libfsm must enumerate every possible byte and continuation. This quickly leads to large DFAs. -Avoid `.*` and `.+` when possible. Wildcards match “anything,” which is often imprecise and forces libfsm to build a large DFA. +For example, a double-quoted string should not use `".*"` because the content cannot contain an unescaped quote. 
Using `.*` forces libfsm to consider all characters -- including both the presence and absence of the closing `"` at every step. This greatly increases the number of states. -For example, a double-quoted string should not use `".*?"` because the content cannot contain an unescaped quote. -Instead, restrict it to the actual valid characters `"[^"\r\n]*"`, which matches only what is allowd and will keep the DFA more compact. +Instead, restrict it to the actual valid characters `"[^"\r\n]*"`, which matches only what is allowed and will keep the DFA more compact. -Use negated character classes: +Use negated character classes to match only the allowed content: | Avoid | Better | | ---------- | -------------- | @@ -143,45 +132,46 @@ Use negated character classes: | `price=.+` | `price=[0-9]+` | | `var\s.+=` | `var\s[^=]+=` | -> This is often the cause of an “explosion” in the size of the generated FSM. -> -> See [Compilation Takes Too Long](#compilation-takes-too-long) for more details. +The overlap between `.*` or `.+` and strings that follow is often the cause of an “explosion” in the size of the generated FSM. So when compilation is slow or generated output is large, look for `.*` and `.+` first and replace them with a narrower character class. ---- - -### 2. Anchor When Matching Full String +2. Anchor When Matching Full String When the intention is to match an entire string, use anchors. Use `^` at the beginning and `\z` for the true end of the string. ```regex # Correct: matches only this exact hostname +# Matches "web12.example.com" +# Does not match "foo-web12.example.com-bar" ^web\d+\.example\.com\z  # Incorrect: would match inside a larger string -web\d+\.example\.com # also matches "foo-web12.example.com-bar" +# Matches "web12.example.com" +# Also matches "foo-web12.example.com-bar" +web\d+\.example\.com ``` ---- - -### 3. Prefer `\z` Over `$` for End-of-String +3. Prefer `\z` Over `$` for End-of-String `\z` always matches the end of the string. 
`$` will also match a trailing newline at the end of the string, so if you use this in combination with capturing groups, you may not be capturing what you expect. -Also, `\z` is more efficient, so it is better to use it in places where `\n` cannot appear. +Also, `\z` produces a smaller FSM, so it is better to use it in places where `\n` cannot appear. ```regex -# Preferred -/foo\z - -# Risky: $ may allow an extra newline -/foo$ +# Preferred: matches only if the string ends with "bar" +# Matches "/foo/bar" +# Does NOT match "/foo/bar\n" +/bar\z + +# Incorrect: allows a trailing newline, +# which is usually unintended and adds unnecessary complexity +# Matches "/foo/bar" +# Also matches "/foo/bar\n" +/bar$ ``` ---- - -### 4. Escape Special Characters When Used As Literal +4. Escape Special Characters When Used As Literal Many characters have special meaning in regex (for example `.`, `+`, `*`, `?`, `[`, `(`). If you mean to match them literally, escape them: @@ -195,9 +185,7 @@ If you mean to match them literally, escape them: | `(test)` | `\(test\)` | `(` and `)` begin/end a group | | Markdown link `[t](u)` | `(\[[^]]*\]\([^)]*\))` | Matches `[text](url)` without crossing `]` or `)` | ---- - -### 5. Use Non-Capturing Groups +5. Use Non-Capturing Groups Capture groups are _currently_ not supported (coming soon!). If you need grouping for alternation or precedence, use non-capturing syntax `(?:...)`: @@ -210,8 +198,6 @@ If you need grouping for alternation or precedence, use non-capturing syntax `(? (private|no-store) ``` ---- - ## Byte Search Optimization (Optional) Patterns that start with an **uncommon character** can be accelerated using an initial byte scan before running the FSM. @@ -231,11 +217,7 @@ These prefixes (`#`, `@`, `[`, `{`, `'`, `"`) are rare in normal text, so a byte We found using `strings.IndexByte` before calling the generated matcher in Go code significantly improved performance when matching strings with a large (>5k) leading prefix. 
---- - -## Troubleshooting - -### Pattern Matches Empty String Unintentionally +## Pattern Matches Empty String Unintentionally Pattern: @@ -251,14 +233,3 @@ This is only an issue if that is not what you intend. * Require at least one match: `\s+` * Anchor context: `^\s+$` or alternatively, use `-Fb` flag - -### Compilation Takes Too Long - -This is often caused by unrestricted wildcards (`.*`, `.+`). -Although they look compact, libfsm must enumerate every possible byte and every possible continuation, causing the state machine to grow quickly. - -For example, to match `var anything =`, a pattern such as `var\s.+=` looks simple, but `.+` forces libfsm to encode every possible byte -and every possible continuation -- including both the presence and absence of `=`. This drastically increases the number of states. - -When compilation is slow, look for broad wildcards and replace them with more specific character classes (as shown [above](#writing-effective-libfsm-patterns)), -such as: `var\s[^=]+=`. From 227f4ac04910ca95913c830858749adb1a310e18 Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 27 Nov 2025 15:53:27 +0000 Subject: [PATCH 61/80] Naming. --- README.md | 9 +++------ doc/{GUIDE.md => advice.md} | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) rename doc/{GUIDE.md => advice.md} (99%) diff --git a/README.md b/README.md index d72b4fece..d30e405b1 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,15 @@ ; re -cb -pl dot '[Ll]ibf+(sm)*' '[Ll]ibre' | dot ![libfsm.svg](doc/tutorial/libfsm.svg) +libfsm is not a drop-in replacement for other regex engines, and it only supports patterns that can be compiled to deterministic FSMs. In return, supported patterns run in linear time. + Getting started: * See the [tutorial introduction](doc/tutorial/re.md) for a quick overview of the re(1) command line interface. * [Compilation phases](doc/tutorial/phases.md) for typical applications which compile regular expressions to code. 
+ * [Advice on using libfsm](doc/advice.md) for suggestions around compilation time, unsupported features, common usage patterns, and examples. You get: @@ -24,12 +27,6 @@ lx is an attempt to produce a simple, expressive, and unobtrusive lexer generator which is good at lexing, does just lexing, is language independent, and has no other features. -### Performance and Requirements - -libfsm is not a drop-in replacement for other regex engines and it only supports patterns that can be compiled to deterministic FSMs. In return, supported patterns run in linear time. - -**→ See [doc/GUIDE.md](doc/GUIDE.md) for detailed unsupported features, usage patterns, and examples.** - ### Building from source Clone with submodules (contains required .mk files): diff --git a/doc/GUIDE.md b/doc/advice.md similarity index 99% rename from doc/GUIDE.md rename to doc/advice.md index 26ea155c6..d2dc402b6 100644 --- a/doc/GUIDE.md +++ b/doc/advice.md @@ -1,4 +1,4 @@ -# Using libfsm for High-Performance Pattern Matching +# Advice on using libfsm for high-performance pattern matching libfsm compiles regular expressions to deterministic finite state machines (FSMs) and generates executable code. FSM-based matching runs in **linear time O(n)** with **no backtracking**. From c4d4ffe9b08e70103db626552bb0d638932d07c3 Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 27 Nov 2025 16:09:46 +0000 Subject: [PATCH 62/80] Markup. 
--- doc/advice.md | 187 +++++++++++++++++++++++++------------------------- 1 file changed, 95 insertions(+), 92 deletions(-) diff --git a/doc/advice.md b/doc/advice.md index d2dc402b6..dff982acc 100644 --- a/doc/advice.md +++ b/doc/advice.md @@ -19,7 +19,7 @@ As a result, libfsm avoids input-dependent slowdowns and is not susceptible to r - [Supported Code Generation Targets](#supported-code-generation-targets) - [Workflow Overview](#workflow-overview) - [Writing Effective libfsm Patterns](#writing-effective-libfsm-patterns) -- [Byte Search Optimization (Optional)](#byte-search-optimization-optional) +- [Byte Search Optimization](#byte-search-optimization-optional) - [Troubleshooting](#troubleshooting) - [Pattern Matches Empty String Unintentionally](#pattern-matches-empty-string-unintentionally) @@ -38,7 +38,7 @@ These PCRE features will not compile: Generate a matcher from a regex: -```bash +```sh # Generate a Go matcher re -p -r pcre -l go -k str 'user\d+' > user_detector.go ``` @@ -56,151 +56,154 @@ Adding code generation for new languages is straightforward and is defined in [s ## Workflow Overview -libfsm provides two main tools: - - **`re`** takes patterns from command line - - **`rx`** takes patterns from file +libfsm provides two main tools for pattern matching: + - **`re`** takes patterns from the command line + - **`rx`** takes patterns from a file A recommended workflow when using libfsm is: 1. Validate the Regex -Test behavior using any PCRE-compatible tool (e.g., [pcregrep(1)](https://man7.org/linux/man-pages/man1/pcregrep.1.html) on the CLI or [https://regex101.com/](https://regex101.com/) in the browser). + Test behavior using any PCRE-compatible tool (e.g., [pcregrep(1)](https://man7.org/linux/man-pages/man1/pcregrep.1.html) on the CLI or [https://regex101.com/](https://regex101.com/) in the browser). 2. Verify libfsm Compatibility -```bash -re -r pcre -l ast 'x*?' 
-# Output: /x*?/:3: Unsupported operator -# :3 indicates that the character at offset 3 in the pattern is rejected. + If unsupported constructs exist, libfsm reports the failing location: + ```sh + re -r pcre -l ast 'x*?' + # Output: /x*?/:3: Unsupported operator + ``` + In this example, `:3` indicates that the character at byte offset three in the pattern is an unsupported feature. -rx -r pcre -l ast -d declined.txt 'x*?' -# Unsupported character in declined.txt -``` + ```sh + # patterns with unsupported operators are output to declined.txt + rx -r pcre -l ast -d declined.txt 'x*?' + ``` -If unsupported constructs exist, libfsm reports the failing location. 3. Generate Code -```bash -re -p -r pcre -l rust -k str '^item-[A-Z]{3}\z' > item_detector.rs -``` + ```sh + re -p -r pcre -l rust -k str '^item-[A-Z]{3}\z' > item_detector.rs + ``` 4. Multiple Patterns -```bash -# re - patterns from command line: -re -p -r pcre -l go -k str '^x?a b+c$' '^x*def?$' '^x$' - -# rx - patterns from file: -rx -p -r pcre -l vmc -k str -d skipped.txt patterns.txt > detectors.c -``` + ```sh + # re - patterns from command line: + re -p -r pcre -l go -k str '^x?a b+c$' '^x*def?$' '^x$' + + # rx - patterns from file: + rx -p -r pcre -l vmc -k str -d skipped.txt patterns.txt > detectors.c + ``` Both tools: * Combine all patterns into one function (like using `|` to join them) -* Return `(bool, int)` - match status and pattern ID +* Generate code that can return `(bool, int)` for the match status and pattern ID * Pattern ID is argument position for `re`, line number for `rx` -* When encountering unsupported patterns: `rx` skips them to `-d` file and generates code with working patterns; `re` fails completely +* When encountering unsupported patterns: `rx` can decline them to `-d` file and generates code with working patterns only; `re` fails completely + +### Common Flags -### Flag Reference | Flag | Purpose | Common Options | Notes | -| ---- | ---------------------------- | 
------------------------------------------ | ---------------------------------------------------------------- | +|:----:|:---------------------------- |:------------------------------------------ |:---------------------------------------------------------------- | | `-r` | Regex dialect | `pcre`, `literal`, `glob`, `native`, `sql` | `pcre` supports the widest set of features | | `-l` | Output language for printing | `go`, `rust`, `vmc`, `llvm`, `wasm`, `dot` | Use `vmc` for `C` code. Pipe `dot` into `idot` for visualization | | `-k` | Generated function I/O API | `str`, `getc`, `pair` | `str` takes string, `pair` takes byte array, `getc` uses callback for streaming | | `-p` | Print mode | *(no value)* | Abbrv. of `-l fsm`. Print the constructed fsm, rather than executing it. | | `-d` | Declined patterns | filename | Only applies to `rx` (batch mode) | -This is not exhausted list. For full flag details, see [include/fsm/options.h](../include/fsm/options.h) and the [man pages](../man). +This is not an exhaustive list. For full flag details, see [include/fsm/options.h](../include/fsm/options.h) and the [man pages](../man). The man pages can be built by running `bmake -r doc`, then view with `build/man/re.1/re.1`. ## Writing Effective libfsm Patterns 1. Replace Broad Wildcards -Avoid `.*` and `.+` when possible. Wildcards match “anything,” which is often imprecise. And although they look compact, libfsm must enumerate every possible byte and continuation. This quickly leads to large DFAs. + Avoid `.*` and `.+` when possible. Wildcards match “anything,” which is often imprecise. And although they look compact, libfsm must enumerate every possible byte and continuation. This quickly leads to large DFAs. -For example, a double-quoted string should not use `".*"` because the content cannot contain an unescaped quote. Using `.*` forces libfsm to consider all characters -- including both the presence and absence of the closing `"` at every step. 
This greatly increases the number of states. + For example, a double-quoted string should not use `".*"` because the content cannot contain an unescaped quote. Using `.*` forces libfsm to consider all characters -- including both the presence and absence of the closing `"` at every step. This greatly increases the number of states. -Instead, restrict it to the actual valid characters `"[^"\r\n]*"`, which matches only what is allowed and will keep the DFA more compact. + Instead, restrict it to the actual valid characters `"[^"\r\n]*"`, which matches only what is allowed and will keep the DFA more compact. -Use negated character classes to match only the allowed content: + Use negated character classes to match only the allowed content: -| Avoid | Better | -| ---------- | -------------- | -| `<.*>` | `<[^>]*>` | -| `\((.*)\)` | `\([^)]*\)` | -| `price=.+` | `price=[0-9]+` | -| `var\s.+=` | `var\s[^=]+=` | + | Avoid | Better | + | ---------- | -------------- | + | `<.*>` | `<[^>]*>` | + | `\((.*)\)` | `\([^)]*\)`| + | `price=.+` | `price=[0-9]+` | + | `var\s.+=` | `var\s[^=]+=` | -The overlap between `.*` or `.+` and strings that follow is often the cause of an “explosion” in the size of the generated FSM. So when compilation is slow or generated output is large, look for `.*` and `.+` first and replace them with a narrower character class. + The overlap between `.*` or `.+` and strings that follow is often the cause of an “explosion” in the size of the generated FSM. So when compilation is slow or generated output is large, look for `.*` and `.+` first and replace them with a narrower character class. 2. Anchor When Matching Full String -When the intention is to match an entire string, use anchors. -Use `^` at the beginning and `\z` for the true end of the string. 
- -```regex -# Correct: matches only this exact hostname -# Matches "web12.example.com" -# Does not match "foo-web12.example.com-bar" -^web\d+\.example\.com\z  - -# Incorrect: would match inside a larger string -# Matches "web12.example.com" -# Also matches "foo-web12.example.com-bar" -web\d+\.example\.com -``` + When the intention is to match an entire string, use anchors. + Use `^` at the beginning and `\z` for the true end of the string. + + ```regex + # Correct: matches only this exact hostname + # Matches "web12.example.com" + # Does not match "foo-web12.example.com-bar" + ^web\d+\.example\.com\z  + + # Incorrect: would match inside a larger string + # Matches "web12.example.com" + # Also matches "foo-web12.example.com-bar" + web\d+\.example\.com + ``` 3. Prefer `\z` Over `$` for End-of-String -`\z` always matches the end of the string. -`$` will also match a trailing newline at the end of the string, -so if you use this in combination with capturing groups, you may not be capturing what you expect. -Also, `\z` produces a smaller FSM, so it is better to use it in places where `\n` cannot appear. - -```regex -# Preferred: matches only if the string ends with "bar" -# Matches "/foo/bar" -# Does NOT match "/foo/bar\n" -/bar\z - -# Incorrect: allows a trailing newline, -# which is usually unintended and adds unnecessary complexity -# Matches "/foo/bar" -# Also matches "/foo/bar\n" -/bar$ -``` + `\z` always matches the end of the string. + `$` will also match a trailing newline at the end of the string, + so if you use this in combination with capturing groups, you may not be capturing what you expect. + Also, `\z` produces a smaller FSM, so it is better to use it in places where `\n` cannot appear. 
+ + ```regex + # Preferred: matches only if the string ends with "bar" + # Matches "/foo/bar" + # Does NOT match "/foo/bar\n" + /bar\z + + # Incorrect: allows a trailing newline, + # which is usually unintended and adds unnecessary complexity + # Matches "/foo/bar" + # Also matches "/foo/bar\n" + /bar$ + ``` 4. Escape Special Characters When Used As Literal -Many characters have special meaning in regex (for example `.`, `+`, `*`, `?`, `[`, `(`). -If you mean to match them literally, escape them: + Many characters have special meaning in regex (for example `.`, `+`, `*`, `?`, `[`, `(`). + If you mean to match them literally, escape them: -| Literal You Want | Correct Regex | Explanation | -|----------------------------|-----------------------------|--------------------------------------------| -| `example.com` | `example\.com` | `.` matches any character unless escaped | -| `a+b` | `a\+b` | `+` means “one or more” | -| `price?` | `price\?` | `?` means “optional” | -| `[value]` | `\[value\]` | `[` and `]` start/end a character class | -| `(test)` | `\(test\)` | `(` and `)` begin/end a group | -| Markdown link `[t](u)` | `(\[[^]]*\]\([^)]*\))` | Matches `[text](url)` without crossing `]` or `)` | + | Literal You Want | Correct Regex | Explanation | + |----------------------------|-----------------------------|--------------------------------------------| + | `example.com` | `example\.com` | `.` matches any character unless escaped | + | `a+b` | `a\+b` | `+` means “one or more” | + | `price?` | `price\?` | `?` means “optional” | + | `[value]` | `\[value\]` | `[` and `]` start/end a character class | + | `(test)` | `\(test\)` | `(` and `)` begin/end a group | + | Markdown link `[t](u)` | `(\[[^]]*\]\([^)]*\))` | Matches `[text](url)` without crossing `]` or `)` | 5. Use Non-Capturing Groups -Capture groups are _currently_ not supported (coming soon!). 
-If you need grouping for alternation or precedence, use non-capturing syntax `(?:...)`: + Capture groups are _currently_ not supported (coming soon!). + If you need grouping for alternation or precedence, use non-capturing syntax `(?:...)`: -```regex -# Correct -(?:private|no-store) - -# Unsupported -(private|no-store) -``` + ```regex + # Correct + (?:private|no-store) + + # Unsupported + (private|no-store) + ``` -## Byte Search Optimization (Optional) +## Byte Search Optimization -Patterns that start with an **uncommon character** can be accelerated using an initial byte scan before running the FSM. +Patterns that start with an uncommon character can be accelerated using an initial byte scan before running the FSM. This quickly jumps to likely match positions instead of scanning every byte. Good candidates are patterns that start with uncommon prefix characters, for example: From bf867f2eb6c45f81c4e3bbb2b07330e5fded396b Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 27 Nov 2025 16:21:30 +0000 Subject: [PATCH 63/80] Blurb on calling the generated code. --- doc/advice.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/doc/advice.md b/doc/advice.md index dff982acc..38c61d8c9 100644 --- a/doc/advice.md +++ b/doc/advice.md @@ -62,11 +62,11 @@ libfsm provides two main tools for pattern matching: A recommended workflow when using libfsm is: -1. Validate the Regex +1. Validate the regex Test behavior using any PCRE-compatible tool (e.g., [pcregrep(1)](https://man7.org/linux/man-pages/man1/pcregrep.1.html) on the CLI or [https://regex101.com/](https://regex101.com/) in the browser). -2. Verify libfsm Compatibility +2. Verify libfsm compatibility If unsupported constructs exist, libfsm reports the failing location: ```sh @@ -81,13 +81,13 @@ A recommended workflow when using libfsm is: ``` -3. Generate Code +3. Generate code ```sh re -p -r pcre -l rust -k str '^item-[A-Z]{3}\z' > item_detector.rs ``` -4. Multiple Patterns +4. 
Multiple patterns ```sh # re - patterns from command line: @@ -97,6 +97,12 @@ A recommended workflow when using libfsm is: rx -p -r pcre -l vmc -k str -d skipped.txt patterns.txt > detectors.c ``` +5. Call the generated code from your program somehow + + You're on your own for this. `-k` controls the API for the generated code to read in data to match. Try different options for the language you're using and see which suits you. + + The generated API can also vary depending on how you want libfsm to handle ambiguities between different patterns. See the `AMBIG_*` flags in [include/fsm/options.h](../include/fsm/options.h) for different approaches there. + Both tools: * Combine all patterns into one function (like using `|` to join them) * Generate code that can return `(bool, int)` for the match status and pattern ID @@ -114,7 +120,7 @@ Both tools: | `-d` | Declined patterns | filename | Only applies to `rx` (batch mode) | This is not an exhaustive list. For full flag details, see [include/fsm/options.h](../include/fsm/options.h) and the [man pages](../man). -The man pages can be built by running `bmake -r doc`, then view with `build/man/re.1/re.1`. +The man pages can be built by running `bmake -r doc`, then view with `man build/man/re.1/re.1`. ## Writing Effective libfsm Patterns From 4351273fe990941fc4e80c231e1018140ffe177a Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 27 Nov 2025 16:50:16 +0000 Subject: [PATCH 64/80] Blurb on bounded repetition. (Contributed by Scott) --- doc/advice.md | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/doc/advice.md b/doc/advice.md index 38c61d8c9..577388da5 100644 --- a/doc/advice.md +++ b/doc/advice.md @@ -87,7 +87,13 @@ A recommended workflow when using libfsm is: re -p -r pcre -l rust -k str '^item-[A-Z]{3}\z' > item_detector.rs ``` -4. Multiple patterns +4. 
Use multiple patterns + + Execution complexity for the generated code is proportional to the length of the text being matched, not to the number of patterns. + Assuming your generated code isn't too large to compile, this means you can have as many patterns as you want, + for the same time it takes to execute a single pattern. + + Take advantage of this. ```sh # re - patterns from command line: @@ -124,7 +130,19 @@ The man pages can be built by running `bmake -r doc`, then view with `man build/ ## Writing Effective libfsm Patterns -1. Replace Broad Wildcards +Generally, to keep generated code compact, stick to the least expressive subset of features. + +libfsm has no way to know in advance what text you'll be passing to its generated code. +For example, are you matching a string that you know will never contain a newline? +libfsm doesn't know that. +It has to generate code that's capable of handling any input. +You can help it out by making your patterns precise. + +Think about what you intend your pattern to match, and what it's actually capable of matching given arbitrary text. +This helps restrict the scope of your pattern from arbitrary text to exactly what you mean. +The following bits of advice illustrate various specific ways to bring down this scope. + +1. Replace broad wildcards Avoid `.*` and `.+` when possible. Wildcards match “anything,” which is often imprecise. And although they look compact, libfsm must enumerate every possible byte and continuation. This quickly leads to large DFAs. @@ -143,7 +161,14 @@ The man pages can be built by running `bmake -r doc`, then view with `man build/ The overlap between `.*` or `.+` and strings that follow is often the cause of an “explosion” in the size of the generated FSM. So when compilation is slow or generated output is large, look for `.*` and `.+` first and replace them with a narrower character class. -2. Anchor When Matching Full String +2. 
Take care with bounded repetition + + If you have the pattern ^x{3,5}$, libfsm's resulting DFA will be structured like "match an x, then match an x, then match an x, then match an x or skip it, then match an x or skip it, then report an overall match if at the end of input". It has to repeat the pattern, noting each time whether it's required or optional (beyond the lower count in {min,max}), because DFA execution doesn't have a counter, just the current state within the overall DFA. + + When the subexpression (represented by `x`) unintentionally matches too many things, they all have to be spelled out every time. + So pay especially close attention to tightening up subexpressions in bounded repetition clauses. + +3. Anchor when matching full string When the intention is to match an entire string, use anchors. Use `^` at the beginning and `\z` for the true end of the string. @@ -160,7 +185,7 @@ The man pages can be built by running `bmake -r doc`, then view with `man build/ web\d+\.example\.com ``` -3. Prefer `\z` Over `$` for End-of-String +4. Prefer `\z` over `$` for End-of-String `\z` always matches the end of the string. `$` will also match a trailing newline at the end of the string, @@ -180,7 +205,7 @@ The man pages can be built by running `bmake -r doc`, then view with `man build/ /bar$ ``` -4. Escape Special Characters When Used As Literal +5. Escape special characters when used as literals Many characters have special meaning in regex (for example `.`, `+`, `*`, `?`, `[`, `(`). If you mean to match them literally, escape them: @@ -194,16 +219,23 @@ The man pages can be built by running `bmake -r doc`, then view with `man build/ | `(test)` | `\(test\)` | `(` and `)` begin/end a group | | Markdown link `[t](u)` | `(\[[^]]*\]\([^)]*\))` | Matches `[text](url)` without crossing `]` or `)` | -5. Use Non-Capturing Groups + The `.` wildcard in particular is often mistakenly left unescaped in practice. + On testing, it will match a literal `.` as intended. 
But it will also match any other character. + This means that not only is your pattern incorrect (write negative test cases!), + but also this part of your FSM is 256 times larger than it should be. + +6. Use non-capturing groups Capture groups are _currently_ not supported (coming soon!). - If you need grouping for alternation or precedence, use non-capturing syntax `(?:...)`: + + If you don't need to capture things, don't use capture. + If you need grouping for alternation or precedence, use PCRE's non-capturing syntax `(?:...)`: ```regex # Correct (?:private|no-store) - # Unsupported + # Not what's intended (private|no-store) ``` @@ -214,7 +246,7 @@ This quickly jumps to likely match positions instead of scanning every byte. Good candidates are patterns that start with uncommon prefix characters, for example: -``` +```regex #tag-[a-z]+ @user-[0-9]+ \[section\] From d8ab92e5834bf8e3a5bcd0f7ac6e3b94ffe5a639 Mon Sep 17 00:00:00 2001 From: Kate F Date: Thu, 27 Nov 2025 17:02:33 +0000 Subject: [PATCH 65/80] Markup. --- doc/advice.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/advice.md b/doc/advice.md index 577388da5..0eb429f94 100644 --- a/doc/advice.md +++ b/doc/advice.md @@ -163,7 +163,7 @@ The following bits of advice illustrate various specific ways to bring down this 2. Take care with bounded repetition - If you have the pattern ^x{3,5}$, libfsm's resulting DFA will be structured like "match an x, then match an x, then match an x, then match an x or skip it, then match an x or skip it, then report an overall match if at the end of input". It has to repeat the pattern, noting each time whether it's required or optional (beyond the lower count in {min,max}), because DFA execution doesn't have a counter, just the current state within the overall DFA. 
+ If you have the pattern `^x{3,5}$`, libfsm's resulting DFA will be structured like "match an x, then match an x, then match an x, then match an x or skip it, then match an x or skip it, then report an overall match if at the end of input". It has to repeat the pattern, noting each time whether it's required or optional (beyond the lower count in `{min,max}`), because DFA execution doesn't have a counter, just the current state within the overall DFA. When the subexpression (represented by `x`) unintentionally matches too many things, they all have to be spelled out every time. So pay especially close attention to tightening up subexpressions in bounded repetition clauses. From e6683daeb5bd8a69e1e69d4546d9c3f244d34e4d Mon Sep 17 00:00:00 2001 From: Kate F Date: Fri, 16 Jan 2026 17:05:07 +0000 Subject: [PATCH 66/80] First cut at re_interpolate_groups() --- Makefile | 1 + include/re/groups.h | 46 ++++++ src/libre/Makefile | 1 + src/libre/libre.syms | 1 + src/libre/re_interpolate_groups.c | 147 ++++++++++++++++++ tests/re_interpolate_groups/Makefile | 23 +++ .../re_interpolate_groups0.c | 73 +++++++++ .../re_interpolate_groups1.c | 63 ++++++++ 8 files changed, 355 insertions(+) create mode 100644 include/re/groups.h create mode 100644 src/libre/re_interpolate_groups.c create mode 100644 tests/re_interpolate_groups/Makefile create mode 100644 tests/re_interpolate_groups/re_interpolate_groups0.c create mode 100644 tests/re_interpolate_groups/re_interpolate_groups1.c diff --git a/Makefile b/Makefile index ad16b1ec7..4065c28b0 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,7 @@ SUBDIR += tests/sql SUBDIR += tests/queue SUBDIR += tests/aho_corasick SUBDIR += tests/retest +SUBDIR += tests/re_interpolate_groups SUBDIR += tests .if make(theft) || make(${BUILD}/theft/theft) SUBDIR += theft diff --git a/include/re/groups.h b/include/re/groups.h new file mode 100644 index 000000000..0c9a77fb3 --- /dev/null +++ b/include/re/groups.h @@ -0,0 +1,46 @@ +/* + * Copyright 2026 
Katherine Flavel + * + * See LICENCE for the full copyright terms. + */ + +#ifndef RE_GROUPS_H +#define RE_GROUPS_H + +struct re_pos; + +/* + * esc is the character for escaping group references, + * typically '\\' or '$'. + * + * group0 is passed separately for caller convenience, + * so you don't have to construct a single array for + * all groups. It's supposed to be the entire string + * that matched. group0 may not be NULL. + * + * groupv is 0-indexed meaning group $1 onwards. + * groupc is the count of elements in groupv. + * + * nonexistent is what to do about references to groups + * that are outside the bounds of the array. NULL means + * to error, otherwise the string value will be used. + * Typically this would be passed as "". + * + * You can distinguish compile-time errors (that is, + * syntax errors in the format string) vs. runtime errors + * (that is, nonexistent groups) by calling + * re_interpolate_groups() ahead of time with groupc = 0 + * and passing a non-NULL nonexistent value. + * + * The output string will always be less than or equal in + * length to the format string. The output is \0-terminated. + * outn includes the \0. 
+ */ +bool +re_interpolate_groups(const char *fmt, char esc, + const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent, + char *outs, size_t outn, + struct re_pos *pos); + +#endif + diff --git a/src/libre/Makefile b/src/libre/Makefile index 67617ab92..36601004a 100644 --- a/src/libre/Makefile +++ b/src/libre/Makefile @@ -10,6 +10,7 @@ SRC += src/libre/ast_new_from_fsm.c SRC += src/libre/ast_rewrite.c SRC += src/libre/ac.c SRC += src/libre/print.c +SRC += src/libre/re_interpolate_groups.c SRC += src/libre/re_strings.c # generated diff --git a/src/libre/libre.syms b/src/libre/libre.syms index 9d381cb0f..b04833777 100644 --- a/src/libre/libre.syms +++ b/src/libre/libre.syms @@ -4,6 +4,7 @@ re_flags re_strerror re_perror re_is_anchored +re_interpolate_groups ast_print ast_print_dot diff --git a/src/libre/re_interpolate_groups.c b/src/libre/re_interpolate_groups.c new file mode 100644 index 000000000..5c9752a41 --- /dev/null +++ b/src/libre/re_interpolate_groups.c @@ -0,0 +1,147 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +// TODO +#define OUT_CHAR(c) do { if (outn < 1) { goto error; } *outs++ = (c); outn--; } while (0) +#define OUT_GROUP(s) do { if (outn < strlen((s))) { goto error; } outs += sprintf(outs, "%s", (s)); outn -= strlen((s)); } while (0) + +// TODO: return values: syntax error, nonexistent group error (digit overflow is the same thing), success +bool +re_interpolate_groups(const char *fmt, char esc, + const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent, + char *outs, size_t outn, + struct re_pos *pos) +{ + unsigned group; // 0 meaning group0, 1 meaning groupv[0], etc + const char *p; + + enum { + STATE_LIT, + STATE_ESC, + STATE_DIGIT + } state; + + assert(esc != '\0'); + assert(group0 != NULL || groupc == 0); + assert(groupc < UINT_MAX / 10 - 1); + assert(outs != NULL); + + state = STATE_LIT; + group = 0; + + p = fmt; + do { + switch (state) { + case STATE_LIT: + if (*p == '\0') { + break; + } + + if (*p == esc) { + state = STATE_ESC; + continue; + } + + OUT_CHAR(*p); + continue; + + case STATE_ESC: + if (*p == '\0') { + goto error; + } + + if (*p == esc) { + OUT_CHAR(esc); + state = STATE_LIT; + continue; + } + + if (isdigit((unsigned char) *p)) { + group = *p - '0'; + state = STATE_DIGIT; + continue; + } + + goto error; + + case STATE_DIGIT: + if (isdigit((unsigned char) *p)) { + group *= 10; + group += *p - '0'; + +// TODO: explain this +// digit overflow, we cap to groupc + 1 +// groupc + 1 is always out of bounds +// this is a simple way to avoid needing to handle digit overflow for subsequent digits, +// assuming groupc *= 10 is <= UINT_MAX + if (group > groupc) { + group = groupc + 1; + } + continue; + } + + if (group == 0) { + OUT_GROUP(group0); + } else if (group <= groupc) { + assert(groupv[group - 1] != NULL); + OUT_GROUP(groupv[group - 1]); + } else if (nonexistent == NULL) { +// TODO: maybe want to indicate this independently from 
syntax errors +// TODO: no need, you can pre-check the entire syntax by running with 0 groups + goto error; + } else { + OUT_GROUP(nonexistent); + } + + group = 0; + state = STATE_LIT; + + if (*p == '\0') { + break; + } + + if (*p == esc) { + state = STATE_ESC; + continue; + } + + OUT_CHAR(*p); + continue; + + default: + assert(!"unreached"); + goto error; + } + } while (*p++); + + if (state != STATE_LIT) { + goto error; + } + + OUT_CHAR('\0'); + + return true; + +error: + + // TODO: track start,end independently + if (pos != NULL) { + pos->byte = p - fmt; + } + + return false; +} + diff --git a/tests/re_interpolate_groups/Makefile b/tests/re_interpolate_groups/Makefile new file mode 100644 index 000000000..41f9e2599 --- /dev/null +++ b/tests/re_interpolate_groups/Makefile @@ -0,0 +1,23 @@ +.include "../../share/mk/top.mk" + +TEST.tests/re_interpolate_groups != ls -1 tests/re_interpolate_groups/re_interpolate_groups*.c +TEST_SRCDIR.tests/re_interpolate_groups = tests/re_interpolate_groups +TEST_OUTDIR.tests/re_interpolate_groups = ${BUILD}/tests/re_interpolate_groups + +.for n in ${TEST.tests/re_interpolate_groups:T:R:C/^re_interpolate_groups//} +test:: ${TEST_OUTDIR.tests/re_interpolate_groups}/res${n} +SRC += ${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c +#CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -UNDEBUG +CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -std=c99 + +${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o + ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o ${BUILD}/src/libre/re_interpolate_groups.o + +${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} + ( ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} 1>&2 && echo PASS || echo FAIL ) > 
${TEST_OUTDIR.tests/re_interpolate_groups}/res${n} + +#.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} +#${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${BUILD}/lib/${lib:R}.a +#.endfor +.endfor + diff --git a/tests/re_interpolate_groups/re_interpolate_groups0.c b/tests/re_interpolate_groups/re_interpolate_groups0.c new file mode 100644 index 000000000..28311afb9 --- /dev/null +++ b/tests/re_interpolate_groups/re_interpolate_groups0.c @@ -0,0 +1,73 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include +#include + +#include +#include + +static unsigned failed; + +static void +test(const char *fmt, size_t groupc, const char *groupv[], const char *expected) +{ + struct re_pos pos; + char outs[40]; + bool r; + + assert(fmt != NULL); + assert(expected != NULL); + + if (!re_interpolate_groups(fmt, '$', "", groupc, groupv, "", outs, sizeof outs, &pos)) { + printf("%s/%zu XXX\n", fmt, groupc); + failed++; + return; + } + + failed += r = 0 != strcmp(outs, expected); + + printf("%s/%zu => %s%s\n", fmt, groupc, outs, + r ? 
" XXX" : ""); +} + +int main(void) { + const char *gn[] = { "one", "two", "three", "four" }; + const char **g0 = NULL; + const char *ga[] = { "1" }; + const char *gb[] = { "" }; +// const char *gc[] = { NULL }; // XXX: not permitted + + test("", 0, g0, ""); + test("", 4, gn, ""); + + test("x", 0, g0, "x"); + test("x", 4, gn, "x"); + + test("\001", 0, g0, "\001"); + test("\001", 4, gn, "\001"); + + test("$0", 0, gn, ""); + test("x$000000000000000000000x", 0, gn, "xx"); + test("x$000000000000000000001x", 1, gn, "xonex"); + test("x$100000000000000000000x", 1, gn, "xx"); + + test("$$$1$1$2$1$3$4$3$2$1$$$$", 4, gn, "$oneonetwoonethreefourthreetwoone$$"); + + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 4, gn, "xyz_one..three;three,$.one-four="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 3, gn, "xyz_one..three;three,$.one-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 2, gn, "xyz_one..;,$.one-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gn, "xyz_one..;,$.one-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 0, g0, "xyz_..;,$.-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, ga, "xyz_1..;,$.1-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gb, "xyz_..;,$.-="); +// test("xyz_$1..$0003;$3,$$.$1-$4=$123", 0, gc, "xyz_..;,$.-="); + + return failed; +} + diff --git a/tests/re_interpolate_groups/re_interpolate_groups1.c b/tests/re_interpolate_groups/re_interpolate_groups1.c new file mode 100644 index 000000000..c75122e1d --- /dev/null +++ b/tests/re_interpolate_groups/re_interpolate_groups1.c @@ -0,0 +1,63 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. 
+ */ + +#include +#include +#include +#include + +#include +#include + +static unsigned failed; + +static void +test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, unsigned expected_pos) +{ + struct re_pos pos; + char outs[10]; + bool r; + + assert(fmt != NULL); + + /* for these tests we're expecting to error */ + if (re_interpolate_groups(fmt, '$', "", groupc, groupv, ne, outs, sizeof outs, &pos)) { + printf("%s/%zu XXX\n", fmt, groupc); + failed++; + return; + } + + failed += r = expected_pos != pos.byte; + + printf("%s/%zu => :%u :%u%s\n", fmt, groupc, + pos.byte, expected_pos, + r ? " XXX" : ""); +} + +int main(void) { + const char *ne = ""; + + const char *gn[] = { "one", "two", "three", "four" }; + const char **g0 = NULL; + const char *ga[] = { "1" }; + const char *gb[] = { "" }; +// const char *gc[] = { NULL }; // XXX: not permitted + + test_err("$", 0, g0, ne, 1); + test_err("$x", 0, g0, ne, 1); + test_err("$ ", 4, gn, ne, 1); + test_err("$\\01", 0, g0, ne, 1); + + test_err("$$$x", 4, gn, ne, 3); + + test_err("xyz$1", 0, gn, NULL, 5); + test_err("xyz$2", 1, gn, NULL, 5); + + test_err("01234567890", 1, gn, ne, 10); + + return failed; +} + From 2f0c7722929f0341f782dac1e230bc5a9907933b Mon Sep 17 00:00:00 2001 From: Kate F Date: Mon, 26 Jan 2026 01:40:27 +0000 Subject: [PATCH 67/80] Add start,end error reporting --- include/re/groups.h | 4 +- src/libre/re_interpolate_groups.c | 33 +++++++++++--- .../re_interpolate_groups0.c | 5 +-- .../re_interpolate_groups1.c | 43 +++++++++++-------- 4 files changed, 55 insertions(+), 30 deletions(-) diff --git a/include/re/groups.h b/include/re/groups.h index 0c9a77fb3..6d8a0bf65 100644 --- a/include/re/groups.h +++ b/include/re/groups.h @@ -26,6 +26,8 @@ struct re_pos; * to error, otherwise the string value will be used. * Typically this would be passed as "". * + * start,end are only populated on error. 
+ * * You can distinguish compile-time errors (that is, * syntax errors in the format string) vs. runtime errors * (that is, nonexistent groups) by calling @@ -40,7 +42,7 @@ bool re_interpolate_groups(const char *fmt, char esc, const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent, char *outs, size_t outn, - struct re_pos *pos); + struct re_pos *start, struct re_pos *end); #endif diff --git a/src/libre/re_interpolate_groups.c b/src/libre/re_interpolate_groups.c index 5c9752a41..8241647b3 100644 --- a/src/libre/re_interpolate_groups.c +++ b/src/libre/re_interpolate_groups.c @@ -15,15 +15,15 @@ #include // TODO -#define OUT_CHAR(c) do { if (outn < 1) { goto error; } *outs++ = (c); outn--; } while (0) -#define OUT_GROUP(s) do { if (outn < strlen((s))) { goto error; } outs += sprintf(outs, "%s", (s)); outn -= strlen((s)); } while (0) +#define OUT_CHAR(c) do { if (outn < 1) { goto overflow; } *outs++ = (c); outn--; } while (0) +#define OUT_GROUP(s) do { if (outn < strlen((s))) { goto overflow; } outs += sprintf(outs, "%s", (s)); outn -= strlen((s)); } while (0) // TODO: return values: syntax error, nonexistent group error (digit overflow is the same thing), success bool re_interpolate_groups(const char *fmt, char esc, const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent, char *outs, size_t outn, - struct re_pos *pos) + struct re_pos *start, struct re_pos *end) { unsigned group; // 0 meaning group0, 1 meaning groupv[0], etc const char *p; @@ -42,7 +42,12 @@ re_interpolate_groups(const char *fmt, char esc, state = STATE_LIT; group = 0; + if (start != NULL) { + start->byte = 0; + } + p = fmt; + do { switch (state) { case STATE_LIT: @@ -51,6 +56,10 @@ re_interpolate_groups(const char *fmt, char esc, } if (*p == esc) { + if (start != NULL) { + start->byte = p - fmt; + } + state = STATE_ESC; continue; } @@ -114,6 +123,10 @@ re_interpolate_groups(const char *fmt, char esc, } if (*p == esc) { + if (start != NULL) { + 
start->byte = p - fmt; + } + state = STATE_ESC; continue; } @@ -125,7 +138,7 @@ re_interpolate_groups(const char *fmt, char esc, assert(!"unreached"); goto error; } - } while (*p++); + } while (*p != '\0' && p++); if (state != STATE_LIT) { goto error; @@ -135,11 +148,17 @@ re_interpolate_groups(const char *fmt, char esc, return true; +overflow: + + /* we're blaming the entire fmt string for overflow */ + if (start != NULL) { + start->byte = 0; + } + error: - // TODO: track start,end independently - if (pos != NULL) { - pos->byte = p - fmt; + if (end != NULL) { + end->byte = p - fmt; } return false; diff --git a/tests/re_interpolate_groups/re_interpolate_groups0.c b/tests/re_interpolate_groups/re_interpolate_groups0.c index 28311afb9..4b4e0f709 100644 --- a/tests/re_interpolate_groups/re_interpolate_groups0.c +++ b/tests/re_interpolate_groups/re_interpolate_groups0.c @@ -17,14 +17,13 @@ static unsigned failed; static void test(const char *fmt, size_t groupc, const char *groupv[], const char *expected) { - struct re_pos pos; char outs[40]; bool r; assert(fmt != NULL); assert(expected != NULL); - if (!re_interpolate_groups(fmt, '$', "", groupc, groupv, "", outs, sizeof outs, &pos)) { + if (!re_interpolate_groups(fmt, '$', "", groupc, groupv, "", outs, sizeof outs, NULL, NULL)) { printf("%s/%zu XXX\n", fmt, groupc); failed++; return; @@ -58,6 +57,7 @@ int main(void) { test("x$100000000000000000000x", 1, gn, "xx"); test("$$$1$1$2$1$3$4$3$2$1$$$$", 4, gn, "$oneonetwoonethreefourthreetwoone$$"); + test("$$$$$$$$$$$$$$$$$$$$", 4, gn, "$$$$$$$$$$"); test("xyz_$1..$0003;$3,$$.$1-$4=$123", 4, gn, "xyz_one..three;three,$.one-four="); test("xyz_$1..$0003;$3,$$.$1-$4=$123", 3, gn, "xyz_one..three;three,$.one-="); @@ -66,7 +66,6 @@ int main(void) { test("xyz_$1..$0003;$3,$$.$1-$4=$123", 0, g0, "xyz_..;,$.-="); test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, ga, "xyz_1..;,$.1-="); test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gb, "xyz_..;,$.-="); -// test("xyz_$1..$0003;$3,$$.$1-$4=$123", 
0, gc, "xyz_..;,$.-="); return failed; } diff --git a/tests/re_interpolate_groups/re_interpolate_groups1.c b/tests/re_interpolate_groups/re_interpolate_groups1.c index c75122e1d..7c8fbf54e 100644 --- a/tests/re_interpolate_groups/re_interpolate_groups1.c +++ b/tests/re_interpolate_groups/re_interpolate_groups1.c @@ -15,26 +15,29 @@ static unsigned failed; static void -test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, unsigned expected_pos) +test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, + unsigned expected_start, unsigned expected_end) { - struct re_pos pos; + struct re_pos start, end; char outs[10]; - bool r; + bool rs, re; assert(fmt != NULL); /* for these tests we're expecting to error */ - if (re_interpolate_groups(fmt, '$', "", groupc, groupv, ne, outs, sizeof outs, &pos)) { + if (re_interpolate_groups(fmt, '$', "", groupc, groupv, ne, outs, sizeof outs, &start, &end)) { printf("%s/%zu XXX\n", fmt, groupc); failed++; return; } - failed += r = expected_pos != pos.byte; + failed += rs = expected_start != start.byte; + failed += re = expected_end != end.byte; - printf("%s/%zu => :%u :%u%s\n", fmt, groupc, - pos.byte, expected_pos, - r ? " XXX" : ""); + printf("%s/%zu => :%u-%u :%u-%u%s\n", fmt, groupc, + start.byte, end.byte, + expected_start, expected_end, + (rs || re) ? 
" XXX" : ""); } int main(void) { @@ -42,21 +45,23 @@ int main(void) { const char *gn[] = { "one", "two", "three", "four" }; const char **g0 = NULL; - const char *ga[] = { "1" }; - const char *gb[] = { "" }; -// const char *gc[] = { NULL }; // XXX: not permitted - test_err("$", 0, g0, ne, 1); - test_err("$x", 0, g0, ne, 1); - test_err("$ ", 4, gn, ne, 1); - test_err("$\\01", 0, g0, ne, 1); + test_err("$", 0, g0, ne, 0, 1); + test_err("$x", 0, g0, ne, 0, 1); + test_err("$ ", 4, gn, ne, 0, 1); + test_err("$\\01", 0, g0, ne, 0, 1); - test_err("$$$x", 4, gn, ne, 3); + test_err("$0$", 0, g0, ne, 2, 3); + test_err("$$$x", 4, gn, ne, 2, 3); - test_err("xyz$1", 0, gn, NULL, 5); - test_err("xyz$2", 1, gn, NULL, 5); + test_err("xyz$1", 0, gn, NULL, 3, 5); + test_err("xyz$2", 1, gn, NULL, 3, 5); - test_err("01234567890", 1, gn, ne, 10); + test_err("01234567890", 1, gn, ne, 0, 10); + test_err("$$$$$$$$$$$$$$$$$$$$", 1, gn, ne, 0, 20); + test_err("$1$1$1$$", 1, gn, ne, 0, 8); + test_err("$1$1$1x", 1, gn, ne, 0, 7); + test_err("xxxyyyzzz$$", 1, gn, ne, 0, 11); return failed; } From 1a6f0078f2f46dbb76914ccbcd74292ca0d132ad Mon Sep 17 00:00:00 2001 From: Kate F Date: Mon, 26 Jan 2026 01:55:41 +0000 Subject: [PATCH 68/80] Clarification. 
--- src/libre/re_interpolate_groups.c | 44 +++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/src/libre/re_interpolate_groups.c b/src/libre/re_interpolate_groups.c index 8241647b3..e08a57b22 100644 --- a/src/libre/re_interpolate_groups.c +++ b/src/libre/re_interpolate_groups.c @@ -14,11 +14,19 @@ #include #include -// TODO -#define OUT_CHAR(c) do { if (outn < 1) { goto overflow; } *outs++ = (c); outn--; } while (0) -#define OUT_GROUP(s) do { if (outn < strlen((s))) { goto overflow; } outs += sprintf(outs, "%s", (s)); outn -= strlen((s)); } while (0) +#define OUT_CHAR(c) \ + do { \ + if (outn < 1) { goto overflow; } \ + *outs++ = (c); outn--; \ + } while (0) + +#define OUT_GROUP(s) \ + do { \ + if (outn < strlen((s))) { goto overflow; } \ + outs += sprintf(outs, "%s", (s)); \ + outn -= strlen((s)); \ + } while (0) -// TODO: return values: syntax error, nonexistent group error (digit overflow is the same thing), success bool re_interpolate_groups(const char *fmt, char esc, const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent, @@ -91,11 +99,18 @@ re_interpolate_groups(const char *fmt, char esc, group *= 10; group += *p - '0'; -// TODO: explain this -// digit overflow, we cap to groupc + 1 -// groupc + 1 is always out of bounds -// this is a simple way to avoid needing to handle digit overflow for subsequent digits, -// assuming groupc *= 10 is <= UINT_MAX + /* + * We need to handle numeric overflow somehow here, + * as we would with using strtol() or similar. But + * we don't need to distinguish this as a special + * error code, semantically it's the same as a group + * that doesn't exist. + * + * groupc + 1 is always out of bounds. So we cap to that, + * using it as a simple way to avoid needing to handle + * numeric overflow for subsequent digits. This assumes + * groupc *= 10 is <= UINT_MAX. 
+ */ if (group > groupc) { group = groupc + 1; } @@ -108,8 +123,15 @@ re_interpolate_groups(const char *fmt, char esc, assert(groupv[group - 1] != NULL); OUT_GROUP(groupv[group - 1]); } else if (nonexistent == NULL) { -// TODO: maybe want to indicate this independently from syntax errors -// TODO: no need, you can pre-check the entire syntax by running with 0 groups + /* + * We could indicate this independently from syntax errors, + * with some way to return different error codes. + * + * But there's no need, you can pre-check the fmt syntax + * by running ahead of time with groupc == 0 and pass + * nonexistent != NULL, because that eliminates the + * possibility for group-related errors. + */ goto error; } else { OUT_GROUP(nonexistent); From cf2bb0afb050d97a83c3c08874d1edf92a8c7b1d Mon Sep 17 00:00:00 2001 From: Kate F Date: Mon, 26 Jan 2026 02:02:52 +0000 Subject: [PATCH 69/80] Fill out placeholders for writing out output. --- src/libre/re_interpolate_groups.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/libre/re_interpolate_groups.c b/src/libre/re_interpolate_groups.c index e08a57b22..8e9760eec 100644 --- a/src/libre/re_interpolate_groups.c +++ b/src/libre/re_interpolate_groups.c @@ -14,17 +14,20 @@ #include #include -#define OUT_CHAR(c) \ - do { \ - if (outn < 1) { goto overflow; } \ - *outs++ = (c); outn--; \ +#define OUT_CHAR(c) \ + do { \ + if (outn < 1) goto overflow; \ + *outs++ = (c); \ + outn--; \ } while (0) -#define OUT_GROUP(s) \ - do { \ - if (outn < strlen((s))) { goto overflow; } \ - outs += sprintf(outs, "%s", (s)); \ - outn -= strlen((s)); \ +#define OUT_GROUP(s) \ + do { \ + size_t n = strlen((s)); \ + if (outn < n) goto overflow; \ + (void) memcpy(outs, s, n); \ + outs += n; \ + outn -= n; \ } while (0) bool From 3234a7cc782c550ce8684849b0ecdce04343a795 Mon Sep 17 00:00:00 2001 From: Kate F Date: Mon, 26 Jan 2026 03:17:25 +0000 Subject: [PATCH 70/80] Convincing myself string offsets are convenient 
--- tests/re_interpolate_groups/re_interpolate_groups1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/re_interpolate_groups/re_interpolate_groups1.c b/tests/re_interpolate_groups/re_interpolate_groups1.c index 7c8fbf54e..5cd9a9da5 100644 --- a/tests/re_interpolate_groups/re_interpolate_groups1.c +++ b/tests/re_interpolate_groups/re_interpolate_groups1.c @@ -34,9 +34,10 @@ test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, failed += rs = expected_start != start.byte; failed += re = expected_end != end.byte; - printf("%s/%zu => :%u-%u :%u-%u%s\n", fmt, groupc, + printf("%s/%zu => :%u-%u :%u-%u '%.*s'%s\n", fmt, groupc, start.byte, end.byte, expected_start, expected_end, + (int) (end.byte - start.byte), fmt + start.byte, (rs || re) ? " XXX" : ""); } From 433c8b8a58f88897e269a335237a93f1941b27ad Mon Sep 17 00:00:00 2001 From: Kate F Date: Mon, 26 Jan 2026 13:53:25 +0000 Subject: [PATCH 71/80] Allow a NULL output string. --- include/re/groups.h | 3 ++ src/libre/re_interpolate_groups.c | 6 +-- .../re_interpolate_groups2.c | 40 +++++++++++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 tests/re_interpolate_groups/re_interpolate_groups2.c diff --git a/include/re/groups.h b/include/re/groups.h index 6d8a0bf65..57f01357a 100644 --- a/include/re/groups.h +++ b/include/re/groups.h @@ -37,6 +37,9 @@ struct re_pos; * The output string will always be less than or equal in * length to the format string. The output is \0-terminated. * outn includes the \0. + * + * outs may be NULL in which case outn must be 0, and no + * output is made. 
*/ bool re_interpolate_groups(const char *fmt, char esc, diff --git a/src/libre/re_interpolate_groups.c b/src/libre/re_interpolate_groups.c index 8e9760eec..0adef4bc3 100644 --- a/src/libre/re_interpolate_groups.c +++ b/src/libre/re_interpolate_groups.c @@ -15,14 +15,14 @@ #include #define OUT_CHAR(c) \ - do { \ + do if (outs != NULL) { \ if (outn < 1) goto overflow; \ *outs++ = (c); \ outn--; \ } while (0) #define OUT_GROUP(s) \ - do { \ + do if (outs != NULL) { \ size_t n = strlen((s)); \ if (outn < n) goto overflow; \ (void) memcpy(outs, s, n); \ @@ -48,7 +48,7 @@ re_interpolate_groups(const char *fmt, char esc, assert(esc != '\0'); assert(group0 != NULL || groupc == 0); assert(groupc < UINT_MAX / 10 - 1); - assert(outs != NULL); + assert(outs != NULL || outn == 0); state = STATE_LIT; group = 0; diff --git a/tests/re_interpolate_groups/re_interpolate_groups2.c b/tests/re_interpolate_groups/re_interpolate_groups2.c new file mode 100644 index 000000000..d33fe656a --- /dev/null +++ b/tests/re_interpolate_groups/re_interpolate_groups2.c @@ -0,0 +1,40 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include +#include + +#include +#include + +static unsigned failed; + +static void +test(const char *fmt, bool expected) +{ + bool r; + + assert(fmt != NULL); + + r = re_interpolate_groups(fmt, '$', "", 0, NULL, "", NULL, 0, NULL, NULL); + + failed += r != expected; + + printf("%s/%d => %d%s\n", fmt, 0, r, + r != expected ? " XXX" : ""); +} + +int main(void) { + test("", true); + test("abc", true); + test("$$", true); + test("$x", false); + + return failed; +} + From 4ae257ee32cb32695f6605ad481b17b0b22f593e Mon Sep 17 00:00:00 2001 From: Kate F Date: Mon, 26 Jan 2026 23:33:48 +0000 Subject: [PATCH 72/80] Clarification. 
Spotted by both June and Scott, thanks --- include/re/groups.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/re/groups.h b/include/re/groups.h index 57f01357a..b89a74013 100644 --- a/include/re/groups.h +++ b/include/re/groups.h @@ -35,8 +35,12 @@ struct re_pos; * and passing a non-NULL nonexistent value. * * The output string will always be less than or equal in - * length to the format string. The output is \0-terminated. - * outn includes the \0. + * length to the format string when all interpolated + * values are the empty string. That is, when groupc is 0 + * and nonexistent is the empty string, or when all groups + * used from groupv[] are the empty string. + * + * The output is \0-terminated. outn includes the \0. * * outs may be NULL in which case outn must be 0, and no * output is made. From 0d487a68b1a3ca83acaec1b61241a573e777f1ed Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 27 Jan 2026 00:45:14 +0000 Subject: [PATCH 73/80] Defensively terminate the output buffer on error. Suggested by Scott --- include/re/groups.h | 3 +++ src/libre/re_interpolate_groups.c | 7 +++++++ tests/re_interpolate_groups/re_interpolate_groups1.c | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/include/re/groups.h b/include/re/groups.h index b89a74013..cddccf6fc 100644 --- a/include/re/groups.h +++ b/include/re/groups.h @@ -44,6 +44,9 @@ struct re_pos; * * outs may be NULL in which case outn must be 0, and no * output is made. + * + * On error the function returns false and the output + * buffer is indeterminate. 
*/ bool re_interpolate_groups(const char *fmt, char esc, diff --git a/src/libre/re_interpolate_groups.c b/src/libre/re_interpolate_groups.c index 0adef4bc3..52b61f116 100644 --- a/src/libre/re_interpolate_groups.c +++ b/src/libre/re_interpolate_groups.c @@ -37,6 +37,7 @@ re_interpolate_groups(const char *fmt, char esc, struct re_pos *start, struct re_pos *end) { unsigned group; // 0 meaning group0, 1 meaning groupv[0], etc + char *outs_orig; const char *p; enum { @@ -53,6 +54,8 @@ re_interpolate_groups(const char *fmt, char esc, state = STATE_LIT; group = 0; + outs_orig = outn > 0 ? outs : NULL; + if (start != NULL) { start->byte = 0; } @@ -186,6 +189,10 @@ re_interpolate_groups(const char *fmt, char esc, end->byte = p - fmt; } + if (outs_orig != NULL) { + *outs_orig = '\0'; + } + return false; } diff --git a/tests/re_interpolate_groups/re_interpolate_groups1.c b/tests/re_interpolate_groups/re_interpolate_groups1.c index 5cd9a9da5..01dc3b344 100644 --- a/tests/re_interpolate_groups/re_interpolate_groups1.c +++ b/tests/re_interpolate_groups/re_interpolate_groups1.c @@ -24,6 +24,8 @@ test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, assert(fmt != NULL); + outs[0] = 'x'; + /* for these tests we're expecting to error */ if (re_interpolate_groups(fmt, '$', "", groupc, groupv, ne, outs, sizeof outs, &start, &end)) { printf("%s/%zu XXX\n", fmt, groupc); @@ -31,6 +33,10 @@ test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, return; } + if (outs[0] != '\0') { + failed++; + } + failed += rs = expected_start != start.byte; failed += re = expected_end != end.byte; From 82473208cb7bba31e4e6e2a2b71e065cb65665f8 Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 27 Jan 2026 12:40:47 +0000 Subject: [PATCH 74/80] Update to actions/cache@v5 --- .github/workflows/ci.yml | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml 
index 157506a14..6b1c09e7d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: steps: - name: Cache checkout - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -52,7 +52,7 @@ jobs: steps: - name: Cache PCRE suite - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-pcre with: path: pcre-suite/${{ env.pcre2 }} @@ -70,7 +70,7 @@ jobs: chmod -R ug-w pcre-suite - name: Cache converted PCRE tests - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-cvtpcre with: path: ${{ env.cvtpcre }} @@ -78,7 +78,7 @@ jobs: - name: Fetch build if: steps.cache-cvtpcre.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} @@ -158,14 +158,14 @@ jobs: steps: - name: Fetch checkout - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} - name: Cache build - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} @@ -236,7 +236,7 @@ jobs: steps: - name: Fetch checkout - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -244,7 +244,7 @@ jobs: # An arbitary build. 
- name: Fetch build - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} @@ -325,7 +325,7 @@ jobs: steps: - name: Fetch checkout - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -347,7 +347,7 @@ jobs: ${{ matrix.cc }} --version - name: Fetch build - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} @@ -384,7 +384,7 @@ jobs: steps: - name: Fetch checkout - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -406,7 +406,7 @@ jobs: ${{ matrix.cc }} --version - name: Fetch build - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} @@ -516,14 +516,14 @@ jobs: go version - name: Fetch build - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} - name: Fetch converted PCRE tests - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-cvtpcre with: path: ${{ env.cvtpcre }} @@ -542,7 +542,7 @@ jobs: steps: - name: Cache docs - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-docs with: path: ${{ env.build }} @@ -557,7 +557,7 @@ jobs: - name: Fetch checkout if: steps.cache-docs.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -597,7 +597,7 @@ jobs: steps: - name: Cache prefix - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-prefix with: path: ${{ env.prefix }} @@ -611,7 +611,7 @@ jobs: - name: Fetch checkout if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -619,7 +619,7 @@ jobs: - name: Fetch build if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache@v5 id: 
cache-build with: path: ${{ env.build }} @@ -627,7 +627,7 @@ jobs: - name: Fetch docs if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-docs with: path: ${{ env.build }} @@ -671,7 +671,7 @@ jobs: fpm -v - name: Fetch prefix - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-prefix with: path: ${{ env.prefix }} From 6d1bd9bb14556686459a735a69247f91f8e4d68a Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 27 Jan 2026 12:37:46 +0000 Subject: [PATCH 75/80] fail-on-cache-miss: for grabbing arbitrary builds. --- .github/workflows/ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6b1c09e7d..5898404f6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -83,6 +83,7 @@ jobs: with: path: ${{ env.build }} key: build-bmake-${{ matrix.os }}-gcc-DEBUG-AUSAN-${{ github.sha }} # arbitrary build, just for cvtpcre + fail-on-cache-miss: true - name: Convert PCRE suite if: steps.cache-cvtpcre.outputs.cache-hit != 'true' @@ -163,6 +164,7 @@ jobs: with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Cache build uses: actions/cache@v5 @@ -241,6 +243,7 @@ jobs: with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true # An arbitary build. - name: Fetch build @@ -249,6 +252,7 @@ jobs: with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: true # We don't need to build the entire repo to know that the makefiles work, # I'm just deleting a couple of .o files and rebuilding those instead. 
@@ -330,6 +334,7 @@ jobs: with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Dependencies (Ubuntu) if: matrix.os == 'ubuntu-22.04' @@ -389,6 +394,7 @@ jobs: with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Dependencies (Ubuntu) if: matrix.os == 'ubuntu-22.04' @@ -562,6 +568,7 @@ jobs: with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Get number of CPU cores if: steps.cache-docs.outputs.cache-hit != 'true' @@ -616,6 +623,7 @@ jobs: with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Fetch build if: steps.cache-prefix.outputs.cache-hit != 'true' From c1203e3a1c27fd4e176c4798e209e3d61753f8c0 Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 27 Jan 2026 13:26:48 +0000 Subject: [PATCH 76/80] Explicitly allow build cache miss for makefile tests. --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5898404f6..8bfcf920f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -246,13 +246,17 @@ jobs: fail-on-cache-miss: true # An arbitary build. + # Failing to fetch this is not fatal, we're testing Makefiles here. + # Some combinations of our options (pmake, EXPENSIVE_CHECKS, whatever) + # won't exist in cache because we didn't build those. That's okay for + # the purposes of this step, building those is harmless. - name: Fetch build uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} - fail-on-cache-miss: true + fail-on-cache-miss: false # We don't need to build the entire repo to know that the makefiles work, # I'm just deleting a couple of .o files and rebuilding those instead. 
From 3f69a7c0ef4e8219286a91274268989036615579 Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 27 Jan 2026 13:44:15 +0000 Subject: [PATCH 77/80] Explicitly fail-on-cache-miss for other things too. --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8bfcf920f..840bbb47d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -361,6 +361,7 @@ jobs: with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: true - name: Get number of CPU cores uses: SimenB/github-actions-cpu-cores@v2 @@ -421,6 +422,7 @@ jobs: with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: true # note we do the fuzzing unconditionally; each run adds to the corpus. # @@ -531,6 +533,7 @@ jobs: with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: true - name: Fetch converted PCRE tests uses: actions/cache@v5 @@ -636,6 +639,7 @@ jobs: with: path: ${{ env.build }} key: build-${{ env.make }}-${{ env.os }}-${{ env.cc }}-${{ env.debug }}-${{ env.san }}-${{ github.sha }} + fail-on-cache-miss: true - name: Fetch docs if: steps.cache-prefix.outputs.cache-hit != 'true' @@ -644,6 +648,7 @@ jobs: with: path: ${{ env.build }} key: docs-${{ github.sha }} + fail-on-cache-miss: true - name: Get number of CPU cores if: steps.cache-prefix.outputs.cache-hit != 'true' @@ -688,6 +693,7 @@ jobs: with: path: ${{ env.prefix }} key: prefix-${{ env.make }}-${{ env.os }}-${{ env.cc }}-${{ env.debug }}-${{ env.san }}-${{ github.sha }} + fail-on-cache-miss: true - name: Find version # TODO: would get a tag or branch name here From 661105e9e6acdbb9b92c962f837e64e81a892918 Mon Sep 
17 00:00:00 2001 From: Kate F Date: Tue, 27 Jan 2026 13:10:45 +0000 Subject: [PATCH 78/80] cache/restore where possible. --- .github/workflows/ci.yml | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 840bbb47d..f86f04535 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -76,9 +76,9 @@ jobs: path: ${{ env.cvtpcre }} key: cvtpcre-bmake-${{ matrix.os }}-gcc-DEBUG-AUSAN-${{ github.sha }}-${{ env.pcre2 }} - - name: Fetch build + - name: Restore build if: steps.cache-cvtpcre.outputs.cache-hit != 'true' - uses: actions/cache@v5 + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} @@ -158,8 +158,8 @@ jobs: cc: gcc # -fsanitize=fuzzer is clang-only steps: - - name: Fetch checkout - uses: actions/cache@v5 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -237,8 +237,8 @@ jobs: make: pmake # not packaged steps: - - name: Fetch checkout - uses: actions/cache@v5 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -250,8 +250,8 @@ jobs: # Some combinations of our options (pmake, EXPENSIVE_CHECKS, whatever) # won't exist in cache because we didn't build those. That's okay for # the purposes of this step, building those is harmless. 
- - name: Fetch build - uses: actions/cache@v5 + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} @@ -332,8 +332,8 @@ jobs: san: MSAN # not supported steps: - - name: Fetch checkout - uses: actions/cache@v5 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -355,8 +355,8 @@ jobs: brew install bmake pcre ${{ matrix.cc }} --version - - name: Fetch build - uses: actions/cache@v5 + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} @@ -393,8 +393,8 @@ jobs: cc: gcc # it's clang anyway steps: - - name: Fetch checkout - uses: actions/cache@v5 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -416,8 +416,8 @@ jobs: brew install bmake ${{ matrix.cc }} --version - - name: Fetch build - uses: actions/cache@v5 + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} @@ -527,8 +527,8 @@ jobs: sudo apt-get install golang go version - - name: Fetch build - uses: actions/cache@v5 + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} @@ -568,9 +568,9 @@ jobs: sudo apt-get update sudo apt-get install bmake libxml2-utils xsltproc docbook-xml docbook-xsl - - name: Fetch checkout + - name: Restore checkout if: steps.cache-docs.outputs.cache-hit != 'true' - uses: actions/cache@v5 + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -623,27 +623,27 @@ jobs: uname -a sudo apt-get install bmake - - name: Fetch checkout + - name: Restore checkout if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v5 + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} fail-on-cache-miss: true - - name: Fetch build + - name: Restore build if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: 
actions/cache@v5 + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ env.make }}-${{ env.os }}-${{ env.cc }}-${{ env.debug }}-${{ env.san }}-${{ github.sha }} fail-on-cache-miss: true - - name: Fetch docs + - name: Restore docs if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v5 + uses: actions/cache/restore@v5 id: cache-docs with: path: ${{ env.build }} @@ -687,8 +687,8 @@ jobs: sudo gem install --no-document fpm fpm -v - - name: Fetch prefix - uses: actions/cache@v5 + - name: Restore prefix + uses: actions/cache/restore@v5 id: cache-prefix with: path: ${{ env.prefix }} From c798ec1b2796ae69fe3a9d80361c14284116cb5c Mon Sep 17 00:00:00 2001 From: Kate F Date: Tue, 27 Jan 2026 16:19:26 +0000 Subject: [PATCH 79/80] Merge mishap, accidentally @v4 --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f86f04535..347d5df38 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -433,7 +433,7 @@ jobs: # still run fuzzing, just from empty, and do not save their seeds. - name: Restore seeds (mode ${{ matrix.mode }}) if: github.repository == 'katef/libfsm' - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 id: cache-seeds with: path: ${{ env.seeds }}-${{ matrix.mode }} @@ -470,7 +470,7 @@ jobs: # the same seeds for a given bug. # The explicit cache/restore and cache/save actions are just for that. - name: Save seeds (mode ${{ matrix.mode }}-${{ matrix.debug }}) - uses: actions/cache/save@v4 + uses: actions/cache/save@v5 if: always() with: path: ${{ env.seeds }}-${{ matrix.mode }} From a6e9c2aec2bd2c5ef9ca7331742037b16ed01893 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Mon, 9 Feb 2026 12:57:55 -0500 Subject: [PATCH 80/80] Makefile: grep for 'FAIL' should use -I to ignore binary files. I ran into a test failure: grep FAIL build/tests/*/*res*; [ $? 
-ne 0 ] grep: build/tests/eager_output/run_mixed_start_anchor_regression: binary file matches *** Error code 1 because tests/eager_output/eager_output_mixed_start_anchor_regression.c contains the substring "res" in "regression", and the binary file contains "FAIL": $ strings build/tests/eager_output/run_mixed_start_anchor_regression | grep FAIL VM_END_FAIL VM_FAIL This shouldn't matter for testing purposes, the 'test' target only cares about the test result files containing "PASS" or "FAIL". The change in cb42d58 to allow prefixed result files (e.g. "dyn-fdgetc-getc-res0") made this check ANY files with "res" in the name, but grep should ignore binary files. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4065c28b0..683e1d906 100644 --- a/Makefile +++ b/Makefile @@ -191,6 +191,6 @@ STAGE_BUILD := ${STAGE_BUILD:Nbin/cvtpcre} .if make(test) .END:: - grep FAIL ${BUILD}/tests/*/*res*; [ $$? -ne 0 ] + grep -I FAIL ${BUILD}/tests/*/*res*; [ $$? -ne 0 ] .endif