diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 157506a14..347d5df38 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: steps: - name: Cache checkout - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-checkout with: path: ${{ env.wc }} @@ -52,7 +52,7 @@ jobs: steps: - name: Cache PCRE suite - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-pcre with: path: pcre-suite/${{ env.pcre2 }} @@ -70,19 +70,20 @@ jobs: chmod -R ug-w pcre-suite - name: Cache converted PCRE tests - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-cvtpcre with: path: ${{ env.cvtpcre }} key: cvtpcre-bmake-${{ matrix.os }}-gcc-DEBUG-AUSAN-${{ github.sha }}-${{ env.pcre2 }} - - name: Fetch build + - name: Restore build if: steps.cache-cvtpcre.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} key: build-bmake-${{ matrix.os }}-gcc-DEBUG-AUSAN-${{ github.sha }} # arbitrary build, just for cvtpcre + fail-on-cache-miss: true - name: Convert PCRE suite if: steps.cache-cvtpcre.outputs.cache-hit != 'true' @@ -157,15 +158,16 @@ jobs: cc: gcc # -fsanitize=fuzzer is clang-only steps: - - name: Fetch checkout - uses: actions/cache@v4 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Cache build - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-build with: path: ${{ env.build }} @@ -235,20 +237,26 @@ jobs: make: pmake # not packaged steps: - - name: Fetch checkout - uses: actions/cache@v4 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true # An arbitary build. - - name: Fetch build - uses: actions/cache@v4 + # Failing to fetch this is not fatal, we're testing Makefiles here. 
+ # Some combinations of our options (pmake, EXPENSIVE_CHECKS, whatever) + # won't exist in cache because we didn't build those. That's okay for + # the purposes of this step, building those is harmless. + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: false # We don't need to build the entire repo to know that the makefiles work, # I'm just deleting a couple of .o files and rebuilding those instead. @@ -324,12 +332,13 @@ jobs: san: MSAN # not supported steps: - - name: Fetch checkout - uses: actions/cache@v4 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Dependencies (Ubuntu) if: matrix.os == 'ubuntu-22.04' @@ -346,12 +355,13 @@ jobs: brew install bmake pcre ${{ matrix.cc }} --version - - name: Fetch build - uses: actions/cache@v4 + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: true - name: Get number of CPU cores uses: SimenB/github-actions-cpu-cores@v2 @@ -383,12 +393,13 @@ jobs: cc: gcc # it's clang anyway steps: - - name: Fetch checkout - uses: actions/cache@v4 + - name: Restore checkout + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Dependencies (Ubuntu) if: matrix.os == 'ubuntu-22.04' @@ -405,12 +416,13 @@ jobs: brew install bmake ${{ matrix.cc }} --version - - name: Fetch build - uses: actions/cache@v4 + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ 
matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: true # note we do the fuzzing unconditionally; each run adds to the corpus. # @@ -421,7 +433,7 @@ jobs: # still run fuzzing, just from empty, and do not save their seeds. - name: Restore seeds (mode ${{ matrix.mode }}) if: github.repository == 'katef/libfsm' - uses: actions/cache/restore@v4 + uses: actions/cache/restore@v5 id: cache-seeds with: path: ${{ env.seeds }}-${{ matrix.mode }} @@ -458,7 +470,7 @@ jobs: # the same seeds for a given bug. # The explicit cache/restore and cache/save actions are just for that. - name: Save seeds (mode ${{ matrix.mode }}-${{ matrix.debug }}) - uses: actions/cache/save@v4 + uses: actions/cache/save@v5 if: always() with: path: ${{ env.seeds }}-${{ matrix.mode }} @@ -515,15 +527,16 @@ jobs: sudo apt-get install golang go version - - name: Fetch build - uses: actions/cache@v4 + - name: Restore build + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ matrix.make }}-${{ matrix.os }}-${{ matrix.cc }}-${{ matrix.debug }}-${{ matrix.san }}-${{ github.sha }} + fail-on-cache-miss: true - name: Fetch converted PCRE tests - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-cvtpcre with: path: ${{ env.cvtpcre }} @@ -542,7 +555,7 @@ jobs: steps: - name: Cache docs - uses: actions/cache@v4 + uses: actions/cache@v5 id: cache-docs with: path: ${{ env.build }} @@ -555,13 +568,14 @@ jobs: sudo apt-get update sudo apt-get install bmake libxml2-utils xsltproc docbook-xml docbook-xsl - - name: Fetch checkout + - name: Restore checkout if: steps.cache-docs.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - name: Get number of CPU cores if: steps.cache-docs.outputs.cache-hit != 'true' @@ -597,7 +611,7 @@ jobs: steps: - name: Cache prefix - uses: actions/cache@v4 + uses: 
actions/cache@v5 id: cache-prefix with: path: ${{ env.prefix }} @@ -609,29 +623,32 @@ jobs: uname -a sudo apt-get install bmake - - name: Fetch checkout + - name: Restore checkout if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache/restore@v5 id: cache-checkout with: path: ${{ env.wc }} key: checkout-${{ github.sha }} + fail-on-cache-miss: true - - name: Fetch build + - name: Restore build if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache/restore@v5 id: cache-build with: path: ${{ env.build }} key: build-${{ env.make }}-${{ env.os }}-${{ env.cc }}-${{ env.debug }}-${{ env.san }}-${{ github.sha }} + fail-on-cache-miss: true - - name: Fetch docs + - name: Restore docs if: steps.cache-prefix.outputs.cache-hit != 'true' - uses: actions/cache@v4 + uses: actions/cache/restore@v5 id: cache-docs with: path: ${{ env.build }} key: docs-${{ github.sha }} + fail-on-cache-miss: true - name: Get number of CPU cores if: steps.cache-prefix.outputs.cache-hit != 'true' @@ -670,12 +687,13 @@ jobs: sudo gem install --no-document fpm fpm -v - - name: Fetch prefix - uses: actions/cache@v4 + - name: Restore prefix + uses: actions/cache/restore@v5 id: cache-prefix with: path: ${{ env.prefix }} key: prefix-${{ env.make }}-${{ env.os }}-${{ env.cc }}-${{ env.debug }}-${{ env.san }}-${{ github.sha }} + fail-on-cache-miss: true - name: Find version # TODO: would get a tag or branch name here diff --git a/Makefile b/Makefile index b356c4f0f..683e1d906 100644 --- a/Makefile +++ b/Makefile @@ -125,8 +125,7 @@ SUBDIR += tests/fsm SUBDIR += tests/glob SUBDIR += tests/like SUBDIR += tests/literal -# FIXME: commenting this out for now due to Makefile error -#SUBDIR += tests/lxpos +SUBDIR += tests/lxpos SUBDIR += tests/minimise SUBDIR += tests/native SUBDIR += tests/pcre @@ -137,6 +136,7 @@ SUBDIR += tests/pcre-repeat SUBDIR += tests/pred SUBDIR += tests/re_literal SUBDIR += tests/re_strings +SUBDIR += 
tests/regressions SUBDIR += tests/reverse SUBDIR += tests/trim SUBDIR += tests/union @@ -147,6 +147,7 @@ SUBDIR += tests/sql SUBDIR += tests/queue SUBDIR += tests/aho_corasick SUBDIR += tests/retest +SUBDIR += tests/re_interpolate_groups SUBDIR += tests .if make(theft) || make(${BUILD}/theft/theft) SUBDIR += theft @@ -190,6 +191,6 @@ STAGE_BUILD := ${STAGE_BUILD:Nbin/cvtpcre} .if make(test) .END:: - grep FAIL ${BUILD}/tests/*/res*; [ $$? -ne 0 ] + grep -I FAIL ${BUILD}/tests/*/*res*; [ $$? -ne 0 ] .endif diff --git a/README.md b/README.md index 545725fe9..d30e405b1 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,15 @@ ; re -cb -pl dot '[Ll]ibf+(sm)*' '[Ll]ibre' | dot ![libfsm.svg](doc/tutorial/libfsm.svg) +libfsm is not a drop-in replacement for other regex engines, and it only supports patterns that can be compiled to deterministic FSMs. In return, supported patterns run in linear time. + Getting started: * See the [tutorial introduction](doc/tutorial/re.md) for a quick overview of the re(1) command line interface. * [Compilation phases](doc/tutorial/phases.md) for typical applications which compile regular expressions to code. + * [Advice on using libfsm](doc/advice.md) for suggestions around compilation time, unsupported features, common usage patterns, and examples. You get: diff --git a/doc/advice.md b/doc/advice.md new file mode 100644 index 000000000..0eb429f94 --- /dev/null +++ b/doc/advice.md @@ -0,0 +1,276 @@ +# Advice on using libfsm for high-performance pattern matching + +libfsm compiles regular expressions to deterministic finite state machines (FSMs) and generates executable code. FSM-based matching runs in **linear time O(n)** with **no backtracking**. + +Regex engines like PCRE use backtracking to explore multiple possible match paths at **runtime**. +This means the same pattern can have different execution costs depending on the input. 
+ +libfsm instead resolves all match decisions at **compile time** by constructing a Deterministic Finite Automaton (DFA). +At runtime, matching is a single linear pass over the input with no alternative paths to explore. + +As a result, libfsm avoids input-dependent slowdowns and is not susceptible to regular expression–based denial-of-service (ReDoS) attacks. + +**libfsm is not a drop-in replacement for traditional regex engines.** It only supports patterns that can be compiled to FSMs. + +### **Topics** + +- [What libfsm Cannot Do](#what-libfsm-cannot-do) +- [Quick Start](#quick-start) +- [Supported Code Generation Targets](#supported-code-generation-targets) +- [Workflow Overview](#workflow-overview) +- [Writing Effective libfsm Patterns](#writing-effective-libfsm-patterns) +- [Byte Search Optimization](#byte-search-optimization) +- [Troubleshooting](#troubleshooting) +- [Pattern Matches Empty String Unintentionally](#pattern-matches-empty-string-unintentionally) + +## What libfsm Cannot Do + +These PCRE features will not compile: + +* Word boundaries (`\b`) +* Non-greedy quantifiers (`*?`, `+?`, `??`) +* Group capture (coming soon!) and backreferences +* Lookahead/lookbehind assertions (`(?=`, `(?!`, `(?<=`, `(? user_detector.go +``` + +This produces a standalone matcher function. + +## Supported Code Generation Targets + +libfsm provides stable, “first-class” code generation for: +- High-level languages: C (via `-l vmc`), Go, Rust +- LLVM IR +- Native WebAssembly + +Adding code generation for new languages is straightforward and is defined in [src/libfsm/print/](../src/libfsm/print/). + +## Workflow Overview + +libfsm provides two main tools for pattern matching: + - **`re`** takes patterns from the command line + - **`rx`** takes patterns from a file + +A recommended workflow when using libfsm is: + +1. 
Validate the regex + + Test behavior using any PCRE-compatible tool (e.g., [pcregrep(1)](https://man7.org/linux/man-pages/man1/pcregrep.1.html) on the CLI or [https://regex101.com/](https://regex101.com/) in the browser). + +2. Verify libfsm compatibility + + If unsupported constructs exist, libfsm reports the failing location: + ```sh + re -r pcre -l ast 'x*?' + # Output: /x*?/:3: Unsupported operator + ``` + In this example, `:3` indicates that the character at byte offset three in the pattern is an unsupported feature. + + ```sh + # patterns with unsupported operators are output to declined.txt + rx -r pcre -l ast -d declined.txt 'x*?' + ``` + + +3. Generate code + + ```sh + re -p -r pcre -l rust -k str '^item-[A-Z]{3}\z' > item_detector.rs + ``` + +4. Use multiple patterns + + Execution complexity for the generated code is proportional to the length of the text being matched, not to the number of patterns. + Assuming your generated code isn't too large to compile, this means you can have as many patterns as you want, + for the same time it takes to execute a single pattern. + + Take advantage of this. + + ```sh + # re - patterns from command line: + re -p -r pcre -l go -k str '^x?a b+c$' '^x*def?$' '^x$' + + # rx - patterns from file: + rx -p -r pcre -l vmc -k str -d skipped.txt patterns.txt > detectors.c + ``` + +5. Call the generated code from your program somehow + + You're on your own for this. `-k` controls the API for the generated code to read in data to match. Try different options for the language you're using and see which suits you. + + The generated API can also vary depending on how you want libfsm to handle ambiguities between different patterns. See the `AMBIG_*` flags in [include/fsm/options.h](../include/fsm/options.h) for different approaches there. 
+ +Both tools: +* Combine all patterns into one function (like using `|` to join them) +* Generate code that can return `(bool, int)` for the match status and pattern ID +* Pattern ID is argument position for `re`, line number for `rx` +* When encountering unsupported patterns: `rx` can decline them to `-d` file and generates code with working patterns only; `re` fails completely + +### Common Flags + +| Flag | Purpose | Common Options | Notes | +|:----:|:---------------------------- |:------------------------------------------ |:---------------------------------------------------------------- | +| `-r` | Regex dialect | `pcre`, `literal`, `glob`, `native`, `sql` | `pcre` supports the widest set of features | +| `-l` | Output language for printing | `go`, `rust`, `vmc`, `llvm`, `wasm`, `dot` | Use `vmc` for `C` code. Pipe `dot` into `idot` for visualization | +| `-k` | Generated function I/O API | `str`, `getc`, `pair` | `str` takes string, `pair` takes byte array, `getc` uses callback for streaming | +| `-p` | Print mode | *(no value)* | Abbrv. of `-l fsm`. Print the constructed fsm, rather than executing it. | +| `-d` | Declined patterns | filename | Only applies to `rx` (batch mode) | + +This is not an exhaustive list. For full flag details, see [include/fsm/options.h](../include/fsm/options.h) and the [man pages](../man). +The man pages can be built by running `bmake -r doc`, then view with `man build/man/re.1/re.1`. + +## Writing Effective libfsm Patterns + +Generally, to keep generated code compact, stick to the least expressive subset of features. + +libfsm has no way to know in advance what text you'll be passing to its generated code. +For example, are you matching a string that you know will never contain a newline? +libfsm doesn't know that. +It has to generate code that's capable of handling any input. +You can help it out by making your patterns precise. 
+ +Think about what you intend your pattern to match, and what it's actually capable of matching given arbitrary text. +This helps restrict the scope of your pattern from arbitrary text to exactly what you mean. +The following bits of advice illustrate various specific ways to bring down this scope. + +1. Replace broad wildcards + + Avoid `.*` and `.+` when possible. Wildcards match “anything,” which is often imprecise. And although they look compact, libfsm must enumerate every possible byte and continuation. This quickly leads to large DFAs. + + For example, a double-quoted string should not use `".*"` because the content cannot contain an unescaped quote. Using `.*` forces libfsm to consider all characters -- including both the presence and absence of the closing `"` at every step. This greatly increases the number of states. + + Instead, restrict it to the actual valid characters `"[^"\r\n]*"`, which matches only what is allowed and will keep the DFA more compact. + + Use negated character classes to match only the allowed content: + + | Avoid | Better | + | ---------- | -------------- | + | `<.*>` | `<[^>]*>` | + | `\((.*)\)` | `\([^)]*\)`| + | `price=.+` | `price=[0-9]+` | + | `var\s.+=` | `var\s[^=]+=` | + + The overlap between `.*` or `.+` and strings that follow is often the cause of an “explosion” in the size of the generated FSM. So when compilation is slow or generated output is large, look for `.*` and `.+` first and replace them with a narrower character class. + +2. Take care with bounded repetition + + If you have the pattern `^x{3,5}$`, libfsm's resulting DFA will be structured like "match an x, then match an x, then match an x, then match an x or skip it, then match an x or skip it, then report an overall match if at the end of input". It has to repeat the pattern, noting each time whether it's required or optional (beyond the lower count in `{min,max}`), because DFA execution doesn't have a counter, just the current state within the overall DFA. 
+ + When the subexpression (represented by `x`) unintentionally matches too many things, they all have to be spelled out every time. + So pay especially close attention to tightening up subexpressions in bounded repetition clauses. + +3. Anchor when matching full string + + When the intention is to match an entire string, use anchors. + Use `^` at the beginning and `\z` for the true end of the string. + + ```regex + # Correct: matches only this exact hostname + # Matches "web12.example.com" + # Does not match "foo-web12.example.com-bar" + ^web\d+\.example\.com\z  + + # Incorrect: would match inside a larger string + # Matches "web12.example.com" + # Also matches "foo-web12.example.com-bar" + web\d+\.example\.com + ``` + +4. Prefer `\z` over `$` for End-of-String + + `\z` always matches the end of the string. + `$` will also match a trailing newline at the end of the string, + so if you use this in combination with capturing groups, you may not be capturing what you expect. + Also, `\z` produces a smaller FSM, so it is better to use it in places where `\n` cannot appear. + + ```regex + # Preferred: matches only if the string ends with "bar" + # Matches "/foo/bar" + # Does NOT match "/foo/bar\n" + /bar\z + + # Incorrect: allows a trailing newline, + # which is usually unintended and adds unnecessary complexity + # Matches "/foo/bar" + # Also matches "/foo/bar\n" + /bar$ + ``` + +5. Escape special characters when used as literals + + Many characters have special meaning in regex (for example `.`, `+`, `*`, `?`, `[`, `(`). 
+ If you mean to match them literally, escape them: + + | Literal You Want | Correct Regex | Explanation | + |----------------------------|-----------------------------|--------------------------------------------| + | `example.com` | `example\.com` | `.` matches any character unless escaped | + | `a+b` | `a\+b` | `+` means “one or more” | + | `price?` | `price\?` | `?` means “optional” | + | `[value]` | `\[value\]` | `[` and `]` start/end a character class | + | `(test)` | `\(test\)` | `(` and `)` begin/end a group | + | Markdown link `[t](u)` | `(\[[^]]*\]\([^)]*\))` | Matches `[text](url)` without crossing `]` or `)` | + + The `.` wildcard in particular is often mistakenly left unescaped in practice. + On testing, it will match a literal `.` as intended. But it will also match any other character. + This means that not only is your pattern incorrect (write negative test cases!), + but also this part of your FSM is 256 times larger than it should be. + +6. Use non-capturing groups + + Capture groups are _currently_ not supported (coming soon!). + + If you don't need to capture things, don't use capture. + If you need grouping for alternation or precedence, use PCRE's non-capturing syntax `(?:...)`: + + ```regex + # Correct + (?:private|no-store) + + # Not what's intended + (private|no-store) + ``` + +## Byte Search Optimization + +Patterns that start with an uncommon character can be accelerated using an initial byte scan before running the FSM. +This quickly jumps to likely match positions instead of scanning every byte. + +Good candidates are patterns that start with uncommon prefix characters, for example: + +```regex +#tag-[a-z]+ +@user-[0-9]+ +\[section\] +{"key": +"name='[^']+'" +``` + +These prefixes (`#`, `@`, `[`, `{`, `'`, `"`) are rare in normal text, so a byte search can skip ahead before running the matcher. 
+ +We found using `strings.IndexByte` before calling the generated matcher in Go code significantly improved performance when matching strings with a large (>5k) leading prefix. + +## Pattern Matches Empty String Unintentionally + +Pattern: + +```regex +\s* +``` + +Will compile to code that always returns true. + +This is only an issue if that is not what you intend. + +**Fix options:** + +* Require at least one match: `\s+` +* Anchor context: `^\s+$` or alternatively, use `-Fb` flag diff --git a/fuzz/target.c b/fuzz/target.c index b7e0f3f7b..fd59aeea4 100644 --- a/fuzz/target.c +++ b/fuzz/target.c @@ -422,11 +422,6 @@ fsm_eager_output_dump(FILE *f, const struct fsm *fsm); static int fuzz_eager_output(const uint8_t *data, size_t size) { - if (size > 0) { - const unsigned seed = data[0]; - srand(seed); - } - struct feo_env env = { .ok = true, .pattern_count = 0, @@ -451,6 +446,8 @@ fuzz_eager_output(const uint8_t *data, size_t size) size_t max_pattern_length = 0; + const unsigned seed = size == 0 ? 
0 : data[0]; + /* chop data into a series of patterns */ { size_t prev = 0; @@ -526,9 +523,14 @@ fuzz_eager_output(const uint8_t *data, size_t size) continue; /* invalid regex */ } + const fsm_output_id_t endid = (fsm_output_id_t)p_i; + ret = fsm_eager_output_set_on_ends(fsm, endid); + assert(ret == 1); + if (verbose) { fprintf(stderr, "==== pattern %zd, pre det\n", p_i); fsm_dump(stderr, fsm); + fsm_eager_output_dump(stderr, fsm); fprintf(stderr, "====\n"); fsm_state_t c = fsm_countstates(fsm); @@ -537,6 +539,12 @@ fuzz_eager_output(const uint8_t *data, size_t size) } } + ret = fsm_determinise(fsm); + assert(ret == 1); + + ret = fsm_minimise(fsm); + assert(ret == 1); + fsm_state_t start; if (!fsm_getstart(fsm, &start)) { fsm_free(fsm); @@ -599,7 +607,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) goto cleanup; /* nothing to do */ } - /* consumes entries[] */ + /* consumes nfas[] */ struct fsm *fsm = fsm_union_repeated_pattern_group(used, nfas, NULL, 0); assert(fsm != NULL); @@ -636,7 +644,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) * Use the combined DFA to generate matches, check that the * match behavior agrees with the individual DFA copies. */ env.current_pattern = (size_t)-1; - if (!fsm_generate_matches(env.combined, max_pattern_length, 1, gen_combined_check_individual_cb, &env)) { + if (!fsm_generate_matches(env.combined, max_pattern_length, seed, gen_combined_check_individual_cb, &env)) { goto cleanup; } @@ -646,7 +654,7 @@ fuzz_eager_output(const uint8_t *data, size_t size) /* check behavior against the combined DFA. 
*/ for (size_t i = 0; i < env.pattern_count; i++) { env.current_pattern = i; - if (!fsm_generate_matches(env.combined, max_pattern_length, 1, gen_individual_check_combined_cb, &env)) { + if (!fsm_generate_matches(env.combined, max_pattern_length, seed, gen_individual_check_combined_cb, &env)) { goto cleanup; } } diff --git a/include/fsm/bool.h b/include/fsm/bool.h index c2c2d80ed..150d63f5d 100644 --- a/include/fsm/bool.h +++ b/include/fsm/bool.h @@ -57,8 +57,12 @@ fsm_union_array(size_t fsm_count, * eager outputs can match. Ownership of the NFAs is transferred, they will * be combined (or freed, if they don't have a start state). * - * This MUST be called with NFAs constructed via re_comp, Calling it with - * manually constructed NFAs or DFAs is unsupported. + * This must be called with NFAs constructed via re_comp, using its + * RE_SAVE_LINKAGE_INFO flag. That saves details during construction + * that are necessary to correctly handle anchoring while linking + * them into the combined NFA. If any of the NFAs do not have that + * information populated, the whole set will be rejected and it + * will return NULL. * * This will set end IDs and/or output IDs representing matching each * of the original NFAs on the combined result, where nfas[i] will diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h index f78d91d71..0f08bfd05 100644 --- a/include/fsm/fsm.h +++ b/include/fsm/fsm.h @@ -270,36 +270,48 @@ fsm_mapendids(struct fsm * fsm, fsm_endid_remap_fun remap, void *opaque); void fsm_increndids(struct fsm * fsm, int delta); -/* Associate an eagerly matched numeric ID with the end states in an fsm. - * - * This is similar to fsm_setendid, but has different performance - * trade-offs. 
In particular, it can become extremely expensive to - * combine multiple DFAs with endids on their end states when they - * representing regexes with unanchored ends, because the FSM has to - * explicitly represent all the possible combinations of matches by - * copying the entire path to every reachable end state. Eager endids - * are associated with the edge leaving the main pattern match. - * - * Returns 1 on success, 0 on error. - * */ -int -fsm_seteagerendid(struct fsm *fsm, fsm_end_id_t id); - /* Set an eager output ID to emit every time the state is entered. - * This turns the automata into a Moore machine. */ + * This is similar to fsm_setendid, but has different performance + * trade-offs for determinisation, and can be applied to + * non-end states. + * + * During DFA execution, states with eager outputs will output their + * ID when execution reaches them. With fsm_exec, this happens via a + * callback (see fsm_eager_output_set_cb). Some print languages + * will eventually support eager outputs. + * + * One use case for eager outputs is combining multiple unanchored + * regexes into a single DFA and detecting when input matches more than + * one of them. With endids, determinisation has to represent every + * possible reachable combination of endids as a distinct copy of the + * DFA subgraph, leading to a combinatorial explosion that makes + * combining more than 8 or so regexes (even very simple ones) + * prohibitively expensive. With eager outputs, the graph no longer + * needs a separate subgraph copy for each combination of IDs, so it is + * possible to combine several dozen or even hundreds of FSMs into a + * single DFA. See fsm_union_repeated_pattern_group for more details. */ int -fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id); +fsm_eager_output_set(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id); /* Set an eager output ID on all current end states. 
*/ int -fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id); +fsm_eager_output_set_on_ends(struct fsm *fsm, fsm_output_id_t id); -/* HACK */ +/* Callback for eager output processing. + * If set (using fsm_eager_output_set_cb), this may be called while fsm_exec runs. */ typedef void fsm_eager_output_cb(fsm_output_id_t id, void *opaque); + +/* Set a callback and opaque argument on an FSM for eager outputs encountered + * while fsm_exec is running. Rather than adding another pair of arguments to + * fsm_exec, this is called as a separate step -- most DFAs will not use eager + * outputs, or use them with code generation rather than fsm_exec. + * + * See fsm_eager_output_set for more details about eager output functionality. */ void fsm_eager_output_set_cb(struct fsm *fsm, fsm_eager_output_cb *cb, void *opaque); +/* Get the eager output callback set on a FSM and its opaque pointer, if any. */ void fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void **opaque); @@ -307,11 +319,39 @@ fsm_eager_output_get_cb(const struct fsm *fsm, fsm_eager_output_cb **cb, void ** size_t fsm_eager_output_count(const struct fsm *fsm, fsm_state_t state); -/* Get eager output associated with a state. It's expected that buf[] has - * sufficient space -- call fsm_eager_output_count first to get the count. - * The contents of buf will be sorted and unique. */ -void -fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, fsm_output_id_t *buf); +/* Get eager output IDs associated with a state, if any. + * id_buf is expected to have enough cells (according to buf_count) + * to store all the eager output IDs. You can find this with fsm_eager_output_count(). + * + * The IDs in the buffer are sorted and do not have duplicates. + * + * Unlike end IDs, eager outputs can appear on states that are + * not marked as end states. + * + * Returns 0 if there is not enough space in id_buf for the + * eager output IDs, or 1 if zero or more IDs were returned. 
*/ +int +fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, + size_t buf_count, fsm_output_id_t *id_buf); + +/* Get the end IDs associated with an end state, if any. + * id_buf is expected to have enough cells (according to id_buf_count) + * to store all the end IDs. You can find this with fsm_endid_count(). + * + * The end IDs in the buffer are sorted and do not have duplicates. + * + * A state with no end IDs set is considered equivalent to a state + * that has the empty set, this API does not distinguish these cases. + * This is not an error. + * + * It is an error to attempt to get end IDs associated with a state + * that is not marked as an end state. + * + * Returns 0 if there is not enough space in id_buf for the + * end IDs, or 1 if zero or more end IDs were returned. */ +int +fsm_endid_get(const struct fsm *fsm, fsm_state_t end_state, + size_t id_buf_count, fsm_end_id_t *id_buf); /* * Find the state (if there is just one), or add epsilon edges from all states, diff --git a/include/fsm/print.h b/include/fsm/print.h index 3df5db304..9921ffa30 100644 --- a/include/fsm/print.h +++ b/include/fsm/print.h @@ -80,8 +80,15 @@ struct fsm_hooks { void *lang_opaque, void *hook_opaque); int (*reject)(FILE *, const struct fsm_options *opt, + const struct fsm_state_metadata *state_metadata, void *lang_opaque, void *hook_opaque); + /* If non-NULL, this will be called to generate code + * in scope immediately after advancing to the + * next character of input. */ + int (*advance)(FILE *, const struct fsm_options *opt, + const char *cur_char_var, void *hook_opaque); + int (*comment)(FILE *, const struct fsm_options *opt, const struct fsm_state_metadata *state_metadata, void *hook_opaque); diff --git a/include/re/groups.h b/include/re/groups.h new file mode 100644 index 000000000..cddccf6fc --- /dev/null +++ b/include/re/groups.h @@ -0,0 +1,58 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. 
+ */ + +#ifndef RE_GROUPS_H +#define RE_GROUPS_H + +struct re_pos; + +/* + * esc is the character for escaping group references, + * typically '\\' or '$'. + * + * group0 is passed separately for caller convenience, + * so you don't have to construct a single array for + * all groups. It's supposed to be the entire string + * that matched. group0 may not be NULL. + * + * groupv is 0-indexed meaning group $1 onwards. + * groupc is the count of elements in groupv. + * + * nonexistent is what to do about references to groups + * that are outside the bounds of the array. NULL means + * to error, otherwise the string value will be used. + * Typically this would be passed as "". + * + * start,end are only populated on error. + * + * You can distinguish compile-time errors (that is, + * syntax errors in the format string) vs. runtime errors + * (that is, nonexistent groups) by calling + * re_interpolate_groups() ahead of time with groupc = 0 + * and passing a non-NULL nonexistent value. + * + * The output string will always be less than or equal in + * length to the format string when all interpolated + * values are the empty string. That is, when groupc is 0 + * and nonexistent is the empty string, or when all groups + * used from groupv[] are the empty string. + * + * The output is \0-terminated. outn includes the \0. + * + * outs may be NULL in which case outn must be 0, and no + * output is made. + * + * On error the function returns false and the output + * buffer is indeterminate. 
+ */ +bool +re_interpolate_groups(const char *fmt, char esc, + const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent, + char *outs, size_t outn, + struct re_pos *start, struct re_pos *end); + +#endif + diff --git a/include/re/re.h b/include/re/re.h index 841e4e946..69551d39f 100644 --- a/include/re/re.h +++ b/include/re/re.h @@ -29,6 +29,9 @@ enum re_flags { RE_ANCHORED = 1 << 6, RE_EXTENDED = 1 << 7, /* PCRE extended mode */ RE_END_NL = 1 << 8, /* end anchor matches '\n' */ + /* save info about linkage at construction time, to inform + * later operations -- see fsm_union_repeated_pattern_group */ + RE_SAVE_LINKAGE_INFO = 1 << 9, RE_FLAGS_NONE = 0 }; diff --git a/man/fsm.1/fsm.1.xml b/man/fsm.1/fsm.1.xml index 776aaaab9..caec5309f 100644 --- a/man/fsm.1/fsm.1.xml +++ b/man/fsm.1/fsm.1.xml @@ -22,6 +22,7 @@ io"> iterations"> length"> + limit"> charset"> -a"> @@ -33,7 +34,7 @@ -G &length.arg;"> -k &io.arg;"> -i &iterations.arg;"> - -S &limit.arg;"> + -S &limit.arg;"> -U &charset.arg;"> -X"> diff --git a/man/rx.1/rx.1.xml b/man/rx.1/rx.1.xml index a9f1ffd18..01956b47a 100644 --- a/man/rx.1/rx.1.xml +++ b/man/rx.1/rx.1.xml @@ -44,6 +44,7 @@ -v"> -w"> -X"> + -x"> -h"> ]> @@ -403,29 +404,20 @@ &u.opt; - Allow ambiguities. - This means patterns with different ids may match the same text. - The default is to error for conflicts. - - It's possible to have multiple patterns with the same id - (i.e. by being in the same file when using multi-file mode), - and these are not considered a conflict because they key - to the same id. - - - - - &u.opt; + Allow ambiguities between patterns. + This means patterns with different ids may match the same text. - - Allow ambiguities between regexps, - such that multiple regexps may match the same text. - The default is to error for ambiguities, + The default is to error for ambiguities, requiring all regexps unioned to be non-overlapping. Formally, the requirement is that they are disjoint languages. 
Erroring for ambiguities applies after multiple regexps are joined, either by union or by concatenation (&s.opt;). + It's possible to have multiple patterns with the same id + (i.e. by being in the same file when using multi-file mode), + and these are not considered a conflict because they key + to the same id. + &u.opt; is implied by &n.opt;. @@ -462,6 +454,17 @@ + + &x.opt; + + + Literals are unanchored. + This applies to all literals; for finer control use a regex dialect. + The default is that literals are anchored, + as if written ^abc$ in regex syntax. + + + &h.opt; diff --git a/share/ucd/CaseFolding.txt b/share/ucd/CaseFolding.txt index 1b7a9c156..a0b0f07fd 100644 --- a/share/ucd/CaseFolding.txt +++ b/share/ucd/CaseFolding.txt @@ -1,6 +1,6 @@ -# CaseFolding-16.0.0.txt -# Date: 2024-04-30, 21:48:11 GMT -# © 2024 Unicode®, Inc. +# CaseFolding-17.0.0.txt +# Date: 2025-07-30, 23:54:36 GMT +# © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/terms_of_use.html # @@ -18,15 +18,15 @@ # The data supports both implementations that require simple case foldings # (where string lengths don't change), and implementations that allow full case folding # (where string lengths may grow). Note that where they can be supported, the -# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# full case foldings are superior: for example, they allow "FUSS" and "Fuß" to match. # # All code points not listed in this file map to themselves. # # NOTE: case folding does not preserve normalization formats! # # For information on case folding, including how to have case folding -# preserve normalization formats, see Section 3.13 Default Case Algorithms in -# The Unicode Standard. +# preserve normalization formats, see the +# "Conformance" / "Default Case Algorithms" section of the core specification. 
# # ================================================================================ # Format @@ -1243,7 +1243,10 @@ A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY A7CB; C; 0264; # LATIN CAPITAL LETTER RAMS HORN A7CC; C; A7CD; # LATIN CAPITAL LETTER S WITH DIAGONAL STROKE +A7CE; C; A7CF; # LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G +A7D2; C; A7D3; # LATIN CAPITAL LETTER DOUBLE THORN +A7D4; C; A7D5; # LATIN CAPITAL LETTER DOUBLE WYNN A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S A7DA; C; A7DB; # LATIN CAPITAL LETTER LAMBDA @@ -1616,6 +1619,31 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z 16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O 16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI 16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y +16EA0; C; 16EBB; # BERIA ERFE CAPITAL LETTER ARKAB +16EA1; C; 16EBC; # BERIA ERFE CAPITAL LETTER BASIGNA +16EA2; C; 16EBD; # BERIA ERFE CAPITAL LETTER DARBAI +16EA3; C; 16EBE; # BERIA ERFE CAPITAL LETTER EH +16EA4; C; 16EBF; # BERIA ERFE CAPITAL LETTER FITKO +16EA5; C; 16EC0; # BERIA ERFE CAPITAL LETTER GOWAY +16EA6; C; 16EC1; # BERIA ERFE CAPITAL LETTER HIRDEABO +16EA7; C; 16EC2; # BERIA ERFE CAPITAL LETTER I +16EA8; C; 16EC3; # BERIA ERFE CAPITAL LETTER DJAI +16EA9; C; 16EC4; # BERIA ERFE CAPITAL LETTER KOBO +16EAA; C; 16EC5; # BERIA ERFE CAPITAL LETTER LAKKO +16EAB; C; 16EC6; # BERIA ERFE CAPITAL LETTER MERI +16EAC; C; 16EC7; # BERIA ERFE CAPITAL LETTER NINI +16EAD; C; 16EC8; # BERIA ERFE CAPITAL LETTER GNA +16EAE; C; 16EC9; # BERIA ERFE CAPITAL LETTER NGAY +16EAF; C; 16ECA; # BERIA ERFE CAPITAL LETTER OI +16EB0; C; 16ECB; # BERIA ERFE CAPITAL LETTER PI +16EB1; C; 16ECC; # BERIA ERFE CAPITAL LETTER ERIGO +16EB2; C; 16ECD; # BERIA ERFE CAPITAL LETTER ERIGO TAMURA +16EB3; C; 16ECE; # BERIA ERFE CAPITAL LETTER SERI +16EB4; C; 16ECF; # 
BERIA ERFE CAPITAL LETTER SHEP +16EB5; C; 16ED0; # BERIA ERFE CAPITAL LETTER TATASOUE +16EB6; C; 16ED1; # BERIA ERFE CAPITAL LETTER UI +16EB7; C; 16ED2; # BERIA ERFE CAPITAL LETTER WASSE +16EB8; C; 16ED3; # BERIA ERFE CAPITAL LETTER AY 1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF 1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI 1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM diff --git a/share/ucd/Makefile b/share/ucd/Makefile index 833910330..22aa0a869 100644 --- a/share/ucd/Makefile +++ b/share/ucd/Makefile @@ -1,4 +1,4 @@ -UCD_URL ?= https://www.unicode.org/Public/16.0.0/ucd/ +UCD_URL ?= https://www.unicode.org/Public/17.0.0/ucd/ WGET ?= wget diff --git a/share/ucd/Scripts.txt b/share/ucd/Scripts.txt index 443a6d2dd..5574fdd6a 100644 --- a/share/ucd/Scripts.txt +++ b/share/ucd/Scripts.txt @@ -1,6 +1,6 @@ -# Scripts-16.0.0.txt -# Date: 2024-04-30, 21:48:40 GMT -# © 2024 Unicode®, Inc. +# Scripts-17.0.0.txt +# Date: 2025-07-24, 13:28:55 GMT +# © 2025 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use and license, see https://www.unicode.org/terms_of_use.html # @@ -154,7 +154,7 @@ 208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS 208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS -20A0..20C0 ; Common # Sc [33] EURO-CURRENCY SIGN..SOM SIGN +20A0..20C1 ; Common # Sc [34] EURO-CURRENCY SIGN..SAUDI RIYAL SIGN 2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT 2102 ; Common # L& DOUBLE-STRUCK CAPITAL C 2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA @@ -306,8 +306,7 @@ 2B45..2B46 ; Common # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Common # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR 2B4D..2B73 ; Common # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR -2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW -2B97..2BFF ; Common # So [105] SYMBOL FOR TYPE A ELECTRONICS..HELLSCHREIBER PAUSE SYMBOL +2B76..2BFF ; Common # So [138] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..HELLSCHREIBER PAUSE SYMBOL 2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Common # Pf RIGHT SUBSTITUTION BRACKET @@ -524,7 +523,11 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR 1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1CC00..1CCEF ; Common # So [240] UP-POINTING GO-KART..OUTLINED LATIN CAPITAL LETTER Z 1CCF0..1CCF9 ; Common # Nd [10] OUTLINED DIGIT ZERO..OUTLINED DIGIT NINE +1CCFA..1CCFC ; Common # So [3] SNAKE SYMBOL..NOSE SYMBOL 1CD00..1CEB3 ; Common # So [436] BLOCK OCTANT-3..BLACK RIGHT TRIANGLE CARET +1CEBA..1CED0 ; Common # So [23] FRAGILE SYMBOL..LEUKOTHEA +1CEE0..1CEEF ; Common # So [16] GEOMANTIC FIGURE POPULUS..GEOMANTIC FIGURE VIA +1CEF0 ; 
Common # Sm MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR 1CF50..1CFC3 ; Common # So [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK 1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO 1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 @@ -605,11 +608,10 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR 1F260..1F265 ; Common # So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI 1F300..1F3FA ; Common # So [251] CYCLONE..AMPHORA 1F3FB..1F3FF ; Common # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 -1F400..1F6D7 ; Common # So [728] RAT..ELEVATOR +1F400..1F6D8 ; Common # So [729] RAT..LANDSLIDE 1F6DC..1F6EC ; Common # So [17] WIRELESS..AIRPLANE ARRIVING 1F6F0..1F6FC ; Common # So [13] SATELLITE..ROLLER SKATE -1F700..1F776 ; Common # So [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE -1F77B..1F7D9 ; Common # So [95] HAUMEA..NINE POINTED WHITE STAR +1F700..1F7D9 ; Common # So [218] ALCHEMICAL SYMBOL FOR QUINTESSENCE..NINE POINTED WHITE STAR 1F7E0..1F7EB ; Common # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE 1F7F0 ; Common # So HEAVY EQUALS SIGN 1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD @@ -619,21 +621,24 @@ FFFC..FFFD ; Common # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHAR 1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS 1F8B0..1F8BB ; Common # So [12] ARROW POINTING UPWARDS THEN NORTH WEST..SOUTH WEST ARROW FROM BAR 1F8C0..1F8C1 ; Common # So [2] LEFTWARDS ARROW FROM DOWNWARDS ARROW..RIGHTWARDS ARROW FROM DOWNWARDS ARROW -1F900..1FA53 ; Common # So [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP +1F8D0..1F8D8 ; Common # Sm [9] LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW..LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE +1F900..1FA57 ; Common # So 
[344] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS ALFIL 1FA60..1FA6D ; Common # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER 1FA70..1FA7C ; Common # So [13] BALLET SHOES..CRUTCH -1FA80..1FA89 ; Common # So [10] YO-YO..HARP -1FA8F..1FAC6 ; Common # So [56] SHOVEL..FINGERPRINT -1FACE..1FADC ; Common # So [15] MOOSE..ROOT VEGETABLE -1FADF..1FAE9 ; Common # So [11] SPLATTER..FACE WITH BAGS UNDER EYES -1FAF0..1FAF8 ; Common # So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND +1FA80..1FA8A ; Common # So [11] YO-YO..TROMBONE +1FA8E..1FAC6 ; Common # So [57] TREASURE CHEST..FINGERPRINT +1FAC8 ; Common # So HAIRY CREATURE +1FACD..1FADC ; Common # So [16] ORCA..ROOT VEGETABLE +1FADF..1FAEA ; Common # So [12] SPLATTER..DISTORTED FACE +1FAEF..1FAF8 ; Common # So [10] FIGHT CLOUD..RIGHTWARDS PUSHING HAND 1FB00..1FB92 ; Common # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK 1FB94..1FBEF ; Common # So [92] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..TOP LEFT JUSTIFIED LOWER RIGHT QUARTER BLACK CIRCLE 1FBF0..1FBF9 ; Common # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE +1FBFA ; Common # So ALARM BELL SYMBOL E0001 ; Common # Cf LANGUAGE TAG E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG -# Total code points: 9053 +# Total code points: 9123 # ================================================ @@ -648,8 +653,8 @@ E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG 01BC..01BF ; Latin # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN 01C0..01C3 ; Latin # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK 01C4..0293 ; Latin # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL -0294 ; Latin # Lo LATIN LETTER GLOTTAL STOP -0295..02AF ; Latin # L& [27] LATIN LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL +0294..0295 ; Latin # Lo [2] LATIN LETTER GLOTTAL STOP..LATIN LETTER PHARYNGEAL VOICED FRICATIVE +0296..02AF ; 
Latin # L& [26] LATIN LETTER INVERTED GLOTTAL STOP..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL 02B0..02B8 ; Latin # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y 02E0..02E4 ; Latin # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP 1D00..1D25 ; Latin # L& [38] LATIN LETTER SMALL CAPITAL A..LATIN LETTER AIN @@ -676,11 +681,8 @@ A770 ; Latin # Lm MODIFIER LETTER US A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT A78F ; Latin # Lo LATIN LETTER SINOLOGICAL DOT -A790..A7CD ; Latin # L& [62] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER S WITH DIAGONAL STROKE -A7D0..A7D1 ; Latin # L& [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G -A7D3 ; Latin # L& LATIN SMALL LETTER DOUBLE THORN -A7D5..A7DC ; Latin # L& [8] LATIN SMALL LETTER DOUBLE WYNN..LATIN CAPITAL LETTER LAMBDA WITH STROKE -A7F2..A7F4 ; Latin # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q +A790..A7DC ; Latin # L& [77] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER LAMBDA WITH STROKE +A7F1..A7F4 ; Latin # Lm [4] MODIFIER LETTER CAPITAL S..MODIFIER LETTER CAPITAL Q A7F5..A7F6 ; Latin # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE @@ -702,7 +704,7 @@ FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN 1DF0B..1DF1E ; Latin # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL 1DF25..1DF2A ; Latin # L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK -# Total code points: 1487 +# Total code points: 1492 # ================================================ @@ -869,7 +871,7 @@ 
FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU 0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE 0870..0887 ; Arabic # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT 0888 ; Arabic # Sk ARABIC RAISED ROUND DOT -0889..088E ; Arabic # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL +0889..088F ; Arabic # Lo [7] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC LETTER NOON WITH RING ABOVE 0890..0891 ; Arabic # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE 0897..089F ; Arabic # Mn [9] ARABIC PEPET..ARABIC HALF MADDA OVER MADDA 08A0..08C8 ; Arabic # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF @@ -878,11 +880,13 @@ FB46..FB4F ; Hebrew # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATU 08E3..08FF ; Arabic # Mn [29] ARABIC TURNED DAMMA BELOW..ARABIC MARK SIDEWAYS NOON GHUNNA FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBB2..FBC2 ; Arabic # Sk [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE +FBC3..FBD2 ; Arabic # So [16] ARABIC LIGATURE JALLA WA-ALAA..ARABIC LIGATURE ALAYHI AR-RAHMAH FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM FD40..FD4F ; Arabic # So [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH FD50..FD8F ; Arabic # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM +FD90..FD91 ; Arabic # So [2] ARABIC LIGATURE RAHMATU ALLAAHI ALAYH..ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM -FDCF ; Arabic # So ARABIC LIGATURE SALAAMUHU ALAYNAA +FDC8..FDCF ; Arabic # So [8] ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA..ARABIC LIGATURE 
SALAAMUHU ALAYNAA FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FDFC ; Arabic # Sc RIAL SIGN FDFD..FDFF ; Arabic # So [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL @@ -890,7 +894,11 @@ FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM 10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS 10EC2..10EC4 ; Arabic # Lo [3] ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW -10EFC..10EFF ; Arabic # Mn [4] ARABIC COMBINING ALEF OVERLAY..ARABIC SMALL LOW WORD MADDA +10EC5 ; Arabic # Lm ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW +10EC6..10EC7 ; Arabic # Lo [2] ARABIC LETTER THIN NOON..ARABIC LETTER YEH WITH FOUR DOTS BELOW +10ED0 ; Arabic # Po ARABIC BIBLICAL END OF VERSE +10ED1..10ED8 ; Arabic # So [8] ARABIC LIGATURE ALAYHAA AS-SALAATU WAS-SALAAM..ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH +10EFA..10EFF ; Arabic # Mn [6] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW WORD MADDA 1EE00..1EE03 ; Arabic # Lo [4] ARABIC MATHEMATICAL ALEF..ARABIC MATHEMATICAL DAL 1EE05..1EE1F ; Arabic # Lo [27] ARABIC MATHEMATICAL WAW..ARABIC MATHEMATICAL DOTLESS QAF 1EE21..1EE22 ; Arabic # Lo [2] ARABIC MATHEMATICAL INITIAL BEH..ARABIC MATHEMATICAL INITIAL JEEM @@ -926,7 +934,7 @@ FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LA 1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL -# Total code points: 1373 +# Total code points: 1413 # ================================================ @@ -1155,7 +1163,7 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY 0C4A..0C4D ; Telugu # Mn 
[4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA 0C55..0C56 ; Telugu # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C58..0C5A ; Telugu # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA -0C5D ; Telugu # Lo TELUGU LETTER NAKAARA POLLU +0C5C..0C5D ; Telugu # Lo [2] TELUGU ARCHAIC SHRII..TELUGU LETTER NAKAARA POLLU 0C60..0C61 ; Telugu # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL 0C62..0C63 ; Telugu # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 0C66..0C6F ; Telugu # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE @@ -1163,7 +1171,7 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY 0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR 0C7F ; Telugu # So TELUGU SIGN TUUMU -# Total code points: 100 +# Total code points: 101 # ================================================ @@ -1186,14 +1194,14 @@ A8FF ; Devanagari # Mn DEVANAGARI VOWEL SIGN AY 0CCA..0CCB ; Kannada # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO 0CCC..0CCD ; Kannada # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA 0CD5..0CD6 ; Kannada # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK -0CDD..0CDE ; Kannada # Lo [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA +0CDC..0CDE ; Kannada # Lo [3] KANNADA ARCHAIC SHRII..KANNADA LETTER FA 0CE0..0CE1 ; Kannada # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL 0CE2..0CE3 ; Kannada # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL 0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA 0CF3 ; Kannada # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT -# Total code points: 91 +# Total code points: 92 # ================================================ @@ -1594,17 +1602,18 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI 16FE2 ; Han # Po OLD CHINESE HOOK MARK 16FE3 ; Han # Lm OLD CHINESE ITERATION 
MARK 16FF0..16FF1 ; Han # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +16FF2..16FF3 ; Han # Lm [2] CHINESE SMALL SIMPLIFIED ER..CHINESE SMALL TRADITIONAL ER +16FF4..16FF6 ; Han # Nl [3] YANGQIN SIGN SLOW ONE BEAT..YANGQIN SIGN SLOW TWO BEATS 20000..2A6DF ; Han # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF -2A700..2B739 ; Han # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 -2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D -2B820..2CEA1 ; Han # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 +2A700..2B81D ; Han # Lo [4382] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B81D +2B820..2CEAD ; Han # Lo [5774] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEAD 2CEB0..2EBE0 ; Han # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 2EBF0..2EE5D ; Han # Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D 2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D 30000..3134A ; Han # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A -31350..323AF ; Han # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF +31350..33479 ; Han # Lo [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479 -# Total code points: 99030 +# Total code points: 103351 # ================================================ @@ -1647,7 +1656,8 @@ A490..A4C6 ; Yi # So [55] YI RADICAL QOT..YI RADICAL KE 0951..0954 ; Inherited # Mn [4] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI ACUTE ACCENT 1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY -1ABF..1ACE ; Inherited # Mn [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T +1ABF..1ADD ; Inherited # Mn [31] COMBINING LATIN SMALL LETTER W BELOW..COMBINING DOT-AND-RING BELOW 
+1AE0..1AEB ; Inherited # Mn [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE 1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL @@ -1676,7 +1686,7 @@ FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CON 1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 657 +# Total code points: 684 # ================================================ @@ -2347,8 +2357,14 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI 111DB ; Sharada # Po SHARADA SIGN SIDDHAM 111DC ; Sharada # Lo SHARADA HEADSTROKE 111DD..111DF ; Sharada # Po [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 +11B60 ; Sharada # Mn SHARADA VOWEL SIGN OE +11B61 ; Sharada # Mc SHARADA VOWEL SIGN OOE +11B62..11B64 ; Sharada # Mn [3] SHARADA VOWEL SIGN UE..SHARADA VOWEL SIGN SHORT E +11B65 ; Sharada # Mc SHARADA VOWEL SIGN SHORT O +11B66 ; Sharada # Mn SHARADA VOWEL SIGN CANDRA E +11B67 ; Sharada # Mc SHARADA VOWEL SIGN CANDRA O -# Total code points: 96 +# Total code points: 104 # ================================================ @@ -2756,11 +2772,11 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI # ================================================ 16FE0 ; Tangut # Lm TANGUT ITERATION MARK -17000..187F7 ; Tangut # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 -18800..18AFF ; Tangut # Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 -18D00..18D08 ; Tangut # Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08 +17000..18AFF ; Tangut # Lo [6912] TANGUT IDEOGRAPH-17000..TANGUT COMPONENT-768 +18D00..18D1E ; 
Tangut # Lo [31] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D1E +18D80..18DF2 ; Tangut # Lo [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883 -# Total code points: 6914 +# Total code points: 7059 # ================================================ @@ -3125,4 +3141,42 @@ ABF0..ABF9 ; Meetei_Mayek # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DI # Total code points: 80 +# ================================================ + +10940..10959 ; Sidetic # Lo [26] SIDETIC LETTER N01..SIDETIC LETTER N26 + +# Total code points: 26 + +# ================================================ + +1E6C0..1E6DE ; Tai_Yo # Lo [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO +1E6E0..1E6E2 ; Tai_Yo # Lo [3] TAI YO LETTER AA..TAI YO LETTER UE +1E6E3 ; Tai_Yo # Mn TAI YO SIGN UE +1E6E4..1E6E5 ; Tai_Yo # Lo [2] TAI YO LETTER U..TAI YO LETTER AE +1E6E6 ; Tai_Yo # Mn TAI YO SIGN AU +1E6E7..1E6ED ; Tai_Yo # Lo [7] TAI YO LETTER O..TAI YO LETTER AUE +1E6EE..1E6EF ; Tai_Yo # Mn [2] TAI YO SIGN AY..TAI YO SIGN ANG +1E6F0..1E6F4 ; Tai_Yo # Lo [5] TAI YO LETTER AN..TAI YO LETTER AP +1E6F5 ; Tai_Yo # Mn TAI YO SIGN OM +1E6FE ; Tai_Yo # Lo TAI YO SYMBOL MUEANG +1E6FF ; Tai_Yo # Lm TAI YO XAM LAI + +# Total code points: 55 + +# ================================================ + +11DB0..11DD8 ; Tolong_Siki # Lo [41] TOLONG SIKI LETTER I..TOLONG SIKI LETTER RRH +11DD9 ; Tolong_Siki # Lm TOLONG SIKI SIGN SELA +11DDA..11DDB ; Tolong_Siki # Lo [2] TOLONG SIKI SIGN HECAKA..TOLONG SIKI UNGGA +11DE0..11DE9 ; Tolong_Siki # Nd [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE + +# Total code points: 54 + +# ================================================ + +16EA0..16EB8 ; Beria_Erfe # L& [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY +16EBB..16ED3 ; Beria_Erfe # L& [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY + +# Total code points: 50 + # EOF diff --git a/share/ucd/UnicodeData.txt b/share/ucd/UnicodeData.txt index 64258a373..fca68e3e1 100644 --- a/share/ucd/UnicodeData.txt 
+++ b/share/ucd/UnicodeData.txt @@ -659,7 +659,7 @@ 0292;LATIN SMALL LETTER EZH;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH;;01B7;;01B7 0293;LATIN SMALL LETTER EZH WITH CURL;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH CURL;;;; 0294;LATIN LETTER GLOTTAL STOP;Lo;0;L;;;;;N;;;;; -0295;LATIN LETTER PHARYNGEAL VOICED FRICATIVE;Ll;0;L;;;;;N;LATIN LETTER REVERSED GLOTTAL STOP;;;; +0295;LATIN LETTER PHARYNGEAL VOICED FRICATIVE;Lo;0;L;;;;;N;LATIN LETTER REVERSED GLOTTAL STOP;;;; 0296;LATIN LETTER INVERTED GLOTTAL STOP;Ll;0;L;;;;;N;;;;; 0297;LATIN LETTER STRETCHED C;Ll;0;L;;;;;N;;;;; 0298;LATIN LETTER BILABIAL CLICK;Ll;0;L;;;;;N;LATIN LETTER BULLSEYE;;;; @@ -2121,6 +2121,7 @@ 088C;ARABIC LETTER TAH WITH THREE DOTS BELOW;Lo;0;AL;;;;;N;;;;; 088D;ARABIC LETTER KEHEH WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; 088E;ARABIC VERTICAL TAIL;Lo;0;AL;;;;;N;;;;; +088F;ARABIC LETTER NOON WITH RING ABOVE;Lo;0;AL;;;;;N;;;;; 0890;ARABIC POUND MARK ABOVE;Cf;0;AN;;;;;N;;;;; 0891;ARABIC PIASTRE MARK ABOVE;Cf;0;AN;;;;;N;;;;; 0897;ARABIC PEPET;Mn;230;NSM;;;;;N;;;;; @@ -2862,6 +2863,7 @@ 0C58;TELUGU LETTER TSA;Lo;0;L;;;;;N;;;;; 0C59;TELUGU LETTER DZA;Lo;0;L;;;;;N;;;;; 0C5A;TELUGU LETTER RRRA;Lo;0;L;;;;;N;;;;; +0C5C;TELUGU ARCHAIC SHRII;Lo;0;L;;;;;N;;;;; 0C5D;TELUGU LETTER NAKAARA POLLU;Lo;0;L;;;;;N;;;;; 0C60;TELUGU LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; 0C61;TELUGU LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;; @@ -2958,6 +2960,7 @@ 0CCD;KANNADA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;; 0CD5;KANNADA LENGTH MARK;Mc;0;L;;;;;N;;;;; 0CD6;KANNADA AI LENGTH MARK;Mc;0;L;;;;;N;;;;; +0CDC;KANNADA ARCHAIC SHRII;Lo;0;L;;;;;N;;;;; 0CDD;KANNADA LETTER NAKAARA POLLU;Lo;0;L;;;;;N;;;;; 0CDE;KANNADA LETTER FA;Lo;0;L;;;;;N;;;;; 0CE0;KANNADA LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; @@ -6137,6 +6140,33 @@ 1ACC;COMBINING LATIN SMALL LETTER INSULAR G;Mn;230;NSM;;;;;N;;;;; 1ACD;COMBINING LATIN SMALL LETTER INSULAR R;Mn;230;NSM;;;;;N;;;;; 1ACE;COMBINING LATIN SMALL LETTER INSULAR T;Mn;230;NSM;;;;;N;;;;; +1ACF;COMBINING DOUBLE CARON;Mn;230;NSM;;;;;N;;;;; 
+1AD0;COMBINING VERTICAL-LINE-ACUTE;Mn;230;NSM;;;;;N;;;;; +1AD1;COMBINING GRAVE-VERTICAL-LINE;Mn;230;NSM;;;;;N;;;;; +1AD2;COMBINING VERTICAL-LINE-GRAVE;Mn;230;NSM;;;;;N;;;;; +1AD3;COMBINING ACUTE-VERTICAL-LINE;Mn;230;NSM;;;;;N;;;;; +1AD4;COMBINING VERTICAL-LINE-MACRON;Mn;230;NSM;;;;;N;;;;; +1AD5;COMBINING MACRON-VERTICAL-LINE;Mn;230;NSM;;;;;N;;;;; +1AD6;COMBINING VERTICAL-LINE-ACUTE-GRAVE;Mn;230;NSM;;;;;N;;;;; +1AD7;COMBINING VERTICAL-LINE-GRAVE-ACUTE;Mn;230;NSM;;;;;N;;;;; +1AD8;COMBINING MACRON-ACUTE-GRAVE;Mn;230;NSM;;;;;N;;;;; +1AD9;COMBINING SHARP SIGN;Mn;230;NSM;;;;;N;;;;; +1ADA;COMBINING FLAT SIGN;Mn;230;NSM;;;;;N;;;;; +1ADB;COMBINING DOWN TACK ABOVE;Mn;230;NSM;;;;;N;;;;; +1ADC;COMBINING DIAERESIS WITH RAISED LEFT DOT;Mn;230;NSM;;;;;N;;;;; +1ADD;COMBINING DOT-AND-RING BELOW;Mn;220;NSM;;;;;N;;;;; +1AE0;COMBINING LEFT TACK ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE1;COMBINING RIGHT TACK ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE2;COMBINING MINUS SIGN ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE3;COMBINING INVERTED BRIDGE ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE4;COMBINING SQUARE ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE5;COMBINING SEAGULL ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE6;COMBINING DOUBLE ARCH BELOW;Mn;220;NSM;;;;;N;;;;; +1AE7;COMBINING DOUBLE ARCH ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE8;COMBINING EQUALS SIGN ABOVE;Mn;230;NSM;;;;;N;;;;; +1AE9;COMBINING LEFT ANGLE CENTRED ABOVE;Mn;230;NSM;;;;;N;;;;; +1AEA;COMBINING UPWARDS ARROW ABOVE;Mn;230;NSM;;;;;N;;;;; +1AEB;COMBINING DOUBLE RIGHTWARDS ARROW ABOVE;Mn;234;NSM;;;;;N;;;;; 1B00;BALINESE SIGN ULU RICEM;Mn;0;NSM;;;;;N;;;;; 1B01;BALINESE SIGN ULU CANDRA;Mn;0;NSM;;;;;N;;;;; 1B02;BALINESE SIGN CECEK;Mn;0;NSM;;;;;N;;;;; @@ -7545,6 +7575,7 @@ 20BE;LARI SIGN;Sc;0;ET;;;;;N;;;;; 20BF;BITCOIN SIGN;Sc;0;ET;;;;;N;;;;; 20C0;SOM SIGN;Sc;0;ET;;;;;N;;;;; +20C1;SAUDI RIYAL SIGN;Sc;0;ET;;;;;N;;;;; 20D0;COMBINING LEFT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING LEFT HARPOON ABOVE;;;; 20D1;COMBINING RIGHT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING RIGHT HARPOON ABOVE;;;; 20D2;COMBINING 
LONG VERTICAL LINE OVERLAY;Mn;1;NSM;;;;;N;NON-SPACING LONG VERTICAL BAR OVERLAY;;;; @@ -10239,6 +10270,7 @@ 2B93;NEWLINE RIGHT;So;0;ON;;;;;N;;;;; 2B94;FOUR CORNER ARROWS CIRCLING ANTICLOCKWISE;So;0;ON;;;;;N;;;;; 2B95;RIGHTWARDS BLACK ARROW;So;0;ON;;;;;N;;;;; +2B96;EQUALS SIGN WITH INFINITY ABOVE;So;0;ON;;;;;N;;;;; 2B97;SYMBOL FOR TYPE A ELECTRONICS;So;0;ON;;;;;N;;;;; 2B98;THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; 2B99;THREE-D RIGHT-LIGHTED UPWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; @@ -14274,10 +14306,14 @@ A7CA;LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY;Ll;0;L;;;;;N;;;A7C9;;A7C9 A7CB;LATIN CAPITAL LETTER RAMS HORN;Lu;0;L;;;;;N;;;;0264; A7CC;LATIN CAPITAL LETTER S WITH DIAGONAL STROKE;Lu;0;L;;;;;N;;;;A7CD; A7CD;LATIN SMALL LETTER S WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;A7CC;;A7CC +A7CE;LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE;Lu;0;L;;;;;N;;;;A7CF; +A7CF;LATIN SMALL LETTER PHARYNGEAL VOICED FRICATIVE;Ll;0;L;;;;;N;;;A7CE;;A7CE A7D0;LATIN CAPITAL LETTER CLOSED INSULAR G;Lu;0;L;;;;;N;;;;A7D1; A7D1;LATIN SMALL LETTER CLOSED INSULAR G;Ll;0;L;;;;;N;;;A7D0;;A7D0 -A7D3;LATIN SMALL LETTER DOUBLE THORN;Ll;0;L;;;;;N;;;;; -A7D5;LATIN SMALL LETTER DOUBLE WYNN;Ll;0;L;;;;;N;;;;; +A7D2;LATIN CAPITAL LETTER DOUBLE THORN;Lu;0;L;;;;;N;;;;A7D3; +A7D3;LATIN SMALL LETTER DOUBLE THORN;Ll;0;L;;;;;N;;;A7D2;;A7D2 +A7D4;LATIN CAPITAL LETTER DOUBLE WYNN;Lu;0;L;;;;;N;;;;A7D5; +A7D5;LATIN SMALL LETTER DOUBLE WYNN;Ll;0;L;;;;;N;;;A7D4;;A7D4 A7D6;LATIN CAPITAL LETTER MIDDLE SCOTS S;Lu;0;L;;;;;N;;;;A7D7; A7D7;LATIN SMALL LETTER MIDDLE SCOTS S;Ll;0;L;;;;;N;;;A7D6;;A7D6 A7D8;LATIN CAPITAL LETTER SIGMOID S;Lu;0;L;;;;;N;;;;A7D9; @@ -14285,6 +14321,7 @@ A7D9;LATIN SMALL LETTER SIGMOID S;Ll;0;L;;;;;N;;;A7D8;;A7D8 A7DA;LATIN CAPITAL LETTER LAMBDA;Lu;0;L;;;;;N;;;;A7DB; A7DB;LATIN SMALL LETTER LAMBDA;Ll;0;L;;;;;N;;;A7DA;;A7DA A7DC;LATIN CAPITAL LETTER LAMBDA WITH STROKE;Lu;0;L;;;;;N;;;;019B; +A7F1;MODIFIER LETTER CAPITAL S;Lm;0;L; 0053;;;;N;;;;; A7F2;MODIFIER 
LETTER CAPITAL C;Lm;0;L; 0043;;;;N;;;;; A7F3;MODIFIER LETTER CAPITAL F;Lm;0;L; 0046;;;;N;;;;; A7F4;MODIFIER LETTER CAPITAL Q;Lm;0;L; 0051;;;;N;;;;; @@ -15925,6 +15962,22 @@ FBBF;ARABIC SYMBOL RING;Sk;0;AL;;;;;N;;;;; FBC0;ARABIC SYMBOL SMALL TAH ABOVE;Sk;0;AL;;;;;N;;;;; FBC1;ARABIC SYMBOL SMALL TAH BELOW;Sk;0;AL;;;;;N;;;;; FBC2;ARABIC SYMBOL WASLA ABOVE;Sk;0;AL;;;;;N;;;;; +FBC3;ARABIC LIGATURE JALLA WA-ALAA;So;0;ON;;;;;N;;;;; +FBC4;ARABIC LIGATURE DAAMAT BARAKAATUHUM;So;0;ON;;;;;N;;;;; +FBC5;ARABIC LIGATURE RAHMATU ALLAAHI TAAALAA ALAYH;So;0;ON;;;;;N;;;;; +FBC6;ARABIC LIGATURE RAHMATU ALLAAHI ALAYHIM;So;0;ON;;;;;N;;;;; +FBC7;ARABIC LIGATURE RAHMATU ALLAAHI ALAYHIMAA;So;0;ON;;;;;N;;;;; +FBC8;ARABIC LIGATURE RAHIMAHUM ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBC9;ARABIC LIGATURE RAHIMAHUMAA ALLAAH;So;0;ON;;;;;N;;;;; +FBCA;ARABIC LIGATURE RAHIMAHUMAA ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBCB;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANHUM;So;0;ON;;;;;N;;;;; +FBCC;ARABIC LIGATURE HAFIZAHU ALLAAH;So;0;ON;;;;;N;;;;; +FBCD;ARABIC LIGATURE HAFIZAHU ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBCE;ARABIC LIGATURE HAFIZAHUM ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBCF;ARABIC LIGATURE HAFIZAHUMAA ALLAAHU TAAALAA;So;0;ON;;;;;N;;;;; +FBD0;ARABIC LIGATURE SALLALLAAHU TAAALAA ALAYHI WA-SALLAM;So;0;ON;;;;;N;;;;; +FBD1;ARABIC LIGATURE AJJAL ALLAAHU FARAJAHU ASH-SHAREEF;So;0;ON;;;;;N;;;;; +FBD2;ARABIC LIGATURE ALAYHI AR-RAHMAH;So;0;ON;;;;;N;;;;; FBD3;ARABIC LETTER NG ISOLATED FORM;Lo;0;AL; 06AD;;;;N;;;;; FBD4;ARABIC LETTER NG FINAL FORM;Lo;0;AL; 06AD;;;;N;;;;; FBD5;ARABIC LETTER NG INITIAL FORM;Lo;0;AL; 06AD;;;;N;;;;; @@ -16370,6 +16423,8 @@ FD8C;ARABIC LIGATURE MEEM WITH JEEM WITH HAH INITIAL FORM;Lo;0;AL; 0645 FD8D;ARABIC LIGATURE MEEM WITH JEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0645 062C 0645;;;;N;;;;; FD8E;ARABIC LIGATURE MEEM WITH KHAH WITH JEEM INITIAL FORM;Lo;0;AL; 0645 062E 062C;;;;N;;;;; FD8F;ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM;Lo;0;AL; 0645 062E 0645;;;;N;;;;; +FD90;ARABIC 
LIGATURE RAHMATU ALLAAHI ALAYH;So;0;ON;;;;;N;;;;; +FD91;ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA;So;0;ON;;;;;N;;;;; FD92;ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM;Lo;0;AL; 0645 062C 062E;;;;N;;;;; FD93;ARABIC LIGATURE HEH WITH MEEM WITH JEEM INITIAL FORM;Lo;0;AL; 0647 0645 062C;;;;N;;;;; FD94;ARABIC LIGATURE HEH WITH MEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0647 0645 0645;;;;N;;;;; @@ -16424,6 +16479,13 @@ FDC4;ARABIC LIGATURE AIN WITH JEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0639 FDC5;ARABIC LIGATURE SAD WITH MEEM WITH MEEM INITIAL FORM;Lo;0;AL; 0635 0645 0645;;;;N;;;;; FDC6;ARABIC LIGATURE SEEN WITH KHAH WITH YEH FINAL FORM;Lo;0;AL; 0633 062E 064A;;;;N;;;;; FDC7;ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM;Lo;0;AL; 0646 062C 064A;;;;N;;;;; +FDC8;ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA;So;0;ON;;;;;N;;;;; +FDC9;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANH;So;0;ON;;;;;N;;;;; +FDCA;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANHAA;So;0;ON;;;;;N;;;;; +FDCB;ARABIC LIGATURE RADI ALLAAHU TAAALAA ANHUMAA;So;0;ON;;;;;N;;;;; +FDCC;ARABIC LIGATURE SALLALLAHU ALAYHI WA-ALAA AALIHEE WA-SALLAM;So;0;ON;;;;;N;;;;; +FDCD;ARABIC LIGATURE AJJAL ALLAAHU TAAALAA FARAJAHU ASH-SHAREEF;So;0;ON;;;;;N;;;;; +FDCE;ARABIC LIGATURE KARRAMA ALLAAHU WAJHAH;So;0;ON;;;;;N;;;;; FDCF;ARABIC LIGATURE SALAAMUHU ALAYNAA;So;0;ON;;;;;N;;;;; FDF0;ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM;Lo;0;AL; 0635 0644 06D2;;;;N;;;;; FDF1;ARABIC LIGATURE QALA USED AS KORANIC STOP SIGN ISOLATED FORM;Lo;0;AL; 0642 0644 06D2;;;;N;;;;; @@ -18708,6 +18770,32 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 10938;LYDIAN LETTER NN;Lo;0;R;;;;;N;;;;; 10939;LYDIAN LETTER C;Lo;0;R;;;;;N;;;;; 1093F;LYDIAN TRIANGULAR MARK;Po;0;R;;;;;N;;;;; +10940;SIDETIC LETTER N01;Lo;0;R;;;;;N;;;;; +10941;SIDETIC LETTER N02;Lo;0;R;;;;;N;;;;; +10942;SIDETIC LETTER N03;Lo;0;R;;;;;N;;;;; +10943;SIDETIC LETTER N04;Lo;0;R;;;;;N;;;;; +10944;SIDETIC LETTER N05;Lo;0;R;;;;;N;;;;; +10945;SIDETIC LETTER N06;Lo;0;R;;;;;N;;;;; +10946;SIDETIC 
LETTER N07;Lo;0;R;;;;;N;;;;; +10947;SIDETIC LETTER N08;Lo;0;R;;;;;N;;;;; +10948;SIDETIC LETTER N09;Lo;0;R;;;;;N;;;;; +10949;SIDETIC LETTER N10;Lo;0;R;;;;;N;;;;; +1094A;SIDETIC LETTER N11;Lo;0;R;;;;;N;;;;; +1094B;SIDETIC LETTER N12;Lo;0;R;;;;;N;;;;; +1094C;SIDETIC LETTER N13;Lo;0;R;;;;;N;;;;; +1094D;SIDETIC LETTER N14;Lo;0;R;;;;;N;;;;; +1094E;SIDETIC LETTER N15;Lo;0;R;;;;;N;;;;; +1094F;SIDETIC LETTER N16;Lo;0;R;;;;;N;;;;; +10950;SIDETIC LETTER N17;Lo;0;R;;;;;N;;;;; +10951;SIDETIC LETTER N18;Lo;0;R;;;;;N;;;;; +10952;SIDETIC LETTER N19;Lo;0;R;;;;;N;;;;; +10953;SIDETIC LETTER N20;Lo;0;R;;;;;N;;;;; +10954;SIDETIC LETTER N21;Lo;0;R;;;;;N;;;;; +10955;SIDETIC LETTER N22;Lo;0;R;;;;;N;;;;; +10956;SIDETIC LETTER N23;Lo;0;R;;;;;N;;;;; +10957;SIDETIC LETTER N24;Lo;0;R;;;;;N;;;;; +10958;SIDETIC LETTER N25;Lo;0;R;;;;;N;;;;; +10959;SIDETIC LETTER N26;Lo;0;R;;;;;N;;;;; 10980;MEROITIC HIEROGLYPHIC LETTER A;Lo;0;R;;;;;N;;;;; 10981;MEROITIC HIEROGLYPHIC LETTER E;Lo;0;R;;;;;N;;;;; 10982;MEROITIC HIEROGLYPHIC LETTER I;Lo;0;R;;;;;N;;;;; @@ -19541,6 +19629,20 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 10EC2;ARABIC LETTER DAL WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; 10EC3;ARABIC LETTER TAH WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; 10EC4;ARABIC LETTER KAF WITH TWO DOTS VERTICALLY BELOW;Lo;0;AL;;;;;N;;;;; +10EC5;ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW;Lm;0;AL;;;;;N;;;;; +10EC6;ARABIC LETTER THIN NOON;Lo;0;AL;;;;;N;;;;; +10EC7;ARABIC LETTER YEH WITH FOUR DOTS BELOW;Lo;0;AL;;;;;N;;;;; +10ED0;ARABIC BIBLICAL END OF VERSE;Po;0;ON;;;;;N;;;;; +10ED1;ARABIC LIGATURE ALAYHAA AS-SALAATU WAS-SALAAM;So;0;ON;;;;;N;;;;; +10ED2;ARABIC LIGATURE ALAYHIM AS-SALAATU WAS-SALAAM;So;0;ON;;;;;N;;;;; +10ED3;ARABIC LIGATURE ALAYHIMAA AS-SALAATU WAS-SALAAM;So;0;ON;;;;;N;;;;; +10ED4;ARABIC LIGATURE QADDASA ALLAAHU SIRRAH;So;0;ON;;;;;N;;;;; +10ED5;ARABIC LIGATURE QUDDISA SIRRUHUM;So;0;ON;;;;;N;;;;; +10ED6;ARABIC LIGATURE QUDDISA SIRRUHUMAA;So;0;ON;;;;;N;;;;; +10ED7;ARABIC LIGATURE QUDDISAT 
ASRAARUHUM;So;0;ON;;;;;N;;;;; +10ED8;ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH;So;0;ON;;;;;N;;;;; +10EFA;ARABIC DOUBLE VERTICAL BAR BELOW;Mn;220;NSM;;;;;N;;;;; +10EFB;ARABIC SMALL LOW NOON;Mn;220;NSM;;;;;N;;;;; 10EFC;ARABIC COMBINING ALEF OVERLAY;Mn;0;NSM;;;;;N;;;;; 10EFD;ARABIC SMALL LOW WORD SAKTA;Mn;220;NSM;;;;;N;;;;; 10EFE;ARABIC SMALL LOW WORD QASR;Mn;220;NSM;;;;;N;;;;; @@ -21521,6 +21623,14 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 11B07;DEVANAGARI SIGN WESTERN NINE-LIKE BHALE;Po;0;L;;;;;N;;;;; 11B08;DEVANAGARI SIGN REVERSED NINE-LIKE BHALE;Po;0;L;;;;;N;;;;; 11B09;DEVANAGARI SIGN MINDU;Po;0;L;;;;;N;;;;; +11B60;SHARADA VOWEL SIGN OE;Mn;0;NSM;;;;;N;;;;; +11B61;SHARADA VOWEL SIGN OOE;Mc;0;L;;;;;N;;;;; +11B62;SHARADA VOWEL SIGN UE;Mn;0;NSM;;;;;N;;;;; +11B63;SHARADA VOWEL SIGN UUE;Mn;0;NSM;;;;;N;;;;; +11B64;SHARADA VOWEL SIGN SHORT E;Mn;0;NSM;;;;;N;;;;; +11B65;SHARADA VOWEL SIGN SHORT O;Mc;0;L;;;;;N;;;;; +11B66;SHARADA VOWEL SIGN CANDRA E;Mn;0;NSM;;;;;N;;;;; +11B67;SHARADA VOWEL SIGN CANDRA O;Mc;0;L;;;;;N;;;;; 11BC0;SUNUWAR LETTER DEVI;Lo;0;L;;;;;N;;;;; 11BC1;SUNUWAR LETTER TASLA;Lo;0;L;;;;;N;;;;; 11BC2;SUNUWAR LETTER EKO;Lo;0;L;;;;;N;;;;; @@ -21868,6 +21978,60 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 11DA7;GUNJALA GONDI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; 11DA8;GUNJALA GONDI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 11DA9;GUNJALA GONDI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +11DB0;TOLONG SIKI LETTER I;Lo;0;L;;;;;N;;;;; +11DB1;TOLONG SIKI LETTER E;Lo;0;L;;;;;N;;;;; +11DB2;TOLONG SIKI LETTER U;Lo;0;L;;;;;N;;;;; +11DB3;TOLONG SIKI LETTER O;Lo;0;L;;;;;N;;;;; +11DB4;TOLONG SIKI LETTER A;Lo;0;L;;;;;N;;;;; +11DB5;TOLONG SIKI LETTER AA;Lo;0;L;;;;;N;;;;; +11DB6;TOLONG SIKI LETTER P;Lo;0;L;;;;;N;;;;; +11DB7;TOLONG SIKI LETTER PH;Lo;0;L;;;;;N;;;;; +11DB8;TOLONG SIKI LETTER B;Lo;0;L;;;;;N;;;;; +11DB9;TOLONG SIKI LETTER BH;Lo;0;L;;;;;N;;;;; +11DBA;TOLONG SIKI LETTER M;Lo;0;L;;;;;N;;;;; +11DBB;TOLONG SIKI LETTER T;Lo;0;L;;;;;N;;;;; +11DBC;TOLONG SIKI LETTER TH;Lo;0;L;;;;;N;;;;; 
+11DBD;TOLONG SIKI LETTER D;Lo;0;L;;;;;N;;;;; +11DBE;TOLONG SIKI LETTER DH;Lo;0;L;;;;;N;;;;; +11DBF;TOLONG SIKI LETTER N;Lo;0;L;;;;;N;;;;; +11DC0;TOLONG SIKI LETTER TT;Lo;0;L;;;;;N;;;;; +11DC1;TOLONG SIKI LETTER TTH;Lo;0;L;;;;;N;;;;; +11DC2;TOLONG SIKI LETTER DD;Lo;0;L;;;;;N;;;;; +11DC3;TOLONG SIKI LETTER DDH;Lo;0;L;;;;;N;;;;; +11DC4;TOLONG SIKI LETTER NN;Lo;0;L;;;;;N;;;;; +11DC5;TOLONG SIKI LETTER C;Lo;0;L;;;;;N;;;;; +11DC6;TOLONG SIKI LETTER CH;Lo;0;L;;;;;N;;;;; +11DC7;TOLONG SIKI LETTER J;Lo;0;L;;;;;N;;;;; +11DC8;TOLONG SIKI LETTER JH;Lo;0;L;;;;;N;;;;; +11DC9;TOLONG SIKI LETTER NY;Lo;0;L;;;;;N;;;;; +11DCA;TOLONG SIKI LETTER K;Lo;0;L;;;;;N;;;;; +11DCB;TOLONG SIKI LETTER KH;Lo;0;L;;;;;N;;;;; +11DCC;TOLONG SIKI LETTER G;Lo;0;L;;;;;N;;;;; +11DCD;TOLONG SIKI LETTER GH;Lo;0;L;;;;;N;;;;; +11DCE;TOLONG SIKI LETTER NG;Lo;0;L;;;;;N;;;;; +11DCF;TOLONG SIKI LETTER Y;Lo;0;L;;;;;N;;;;; +11DD0;TOLONG SIKI LETTER R;Lo;0;L;;;;;N;;;;; +11DD1;TOLONG SIKI LETTER L;Lo;0;L;;;;;N;;;;; +11DD2;TOLONG SIKI LETTER V;Lo;0;L;;;;;N;;;;; +11DD3;TOLONG SIKI LETTER NNY;Lo;0;L;;;;;N;;;;; +11DD4;TOLONG SIKI LETTER S;Lo;0;L;;;;;N;;;;; +11DD5;TOLONG SIKI LETTER H;Lo;0;L;;;;;N;;;;; +11DD6;TOLONG SIKI LETTER X;Lo;0;L;;;;;N;;;;; +11DD7;TOLONG SIKI LETTER RR;Lo;0;L;;;;;N;;;;; +11DD8;TOLONG SIKI LETTER RRH;Lo;0;L;;;;;N;;;;; +11DD9;TOLONG SIKI SIGN SELA;Lm;0;L;;;;;N;;;;; +11DDA;TOLONG SIKI SIGN HECAKA;Lo;0;L;;;;;N;;;;; +11DDB;TOLONG SIKI UNGGA;Lo;0;L;;;;;N;;;;; +11DE0;TOLONG SIKI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +11DE1;TOLONG SIKI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +11DE2;TOLONG SIKI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +11DE3;TOLONG SIKI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +11DE4;TOLONG SIKI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +11DE5;TOLONG SIKI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +11DE6;TOLONG SIKI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +11DE7;TOLONG SIKI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +11DE8;TOLONG SIKI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +11DE9;TOLONG SIKI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 11EE0;MAKASAR LETTER KA;Lo;0;L;;;;;N;;;;; 
11EE1;MAKASAR LETTER GA;Lo;0;L;;;;;N;;;;; 11EE2;MAKASAR LETTER NGA;Lo;0;L;;;;;N;;;;; @@ -22088,8 +22252,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12035;CUNEIFORM SIGN ARAD TIMES KUR;Lo;0;L;;;;;N;;;;; 12036;CUNEIFORM SIGN ARKAB;Lo;0;L;;;;;N;;;;; 12037;CUNEIFORM SIGN ASAL2;Lo;0;L;;;;;N;;;;; -12038;CUNEIFORM SIGN ASH;Lo;0;L;;;;;N;;;;; -12039;CUNEIFORM SIGN ASH ZIDA TENU;Lo;0;L;;;;;N;;;;; +12038;CUNEIFORM SIGN ASH;Lo;0;L;;;;1;N;;;;; +12039;CUNEIFORM SIGN ASH ZIDA TENU;Lo;0;L;;;;1;N;;;;; 1203A;CUNEIFORM SIGN ASH KABA TENU;Lo;0;L;;;;;N;;;;; 1203B;CUNEIFORM SIGN ASH OVER ASH TUG2 OVER TUG2 TUG2 OVER TUG2 PAP;Lo;0;L;;;;;N;;;;; 1203C;CUNEIFORM SIGN ASH OVER ASH OVER ASH;Lo;0;L;;;;;N;;;;; @@ -22153,7 +22317,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12076;CUNEIFORM SIGN DIM2;Lo;0;L;;;;;N;;;;; 12077;CUNEIFORM SIGN DIN;Lo;0;L;;;;;N;;;;; 12078;CUNEIFORM SIGN DIN KASKAL U GUNU DISH;Lo;0;L;;;;;N;;;;; -12079;CUNEIFORM SIGN DISH;Lo;0;L;;;;;N;;;;; +12079;CUNEIFORM SIGN DISH;Lo;0;L;;;;1;N;;;;; 1207A;CUNEIFORM SIGN DU;Lo;0;L;;;;;N;;;;; 1207B;CUNEIFORM SIGN DU OVER DU;Lo;0;L;;;;;N;;;;; 1207C;CUNEIFORM SIGN DU GUNU;Lo;0;L;;;;;N;;;;; @@ -22582,12 +22746,12 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12223;CUNEIFORM SIGN MA2;Lo;0;L;;;;;N;;;;; 12224;CUNEIFORM SIGN MAH;Lo;0;L;;;;;N;;;;; 12225;CUNEIFORM SIGN MAR;Lo;0;L;;;;;N;;;;; -12226;CUNEIFORM SIGN MASH;Lo;0;L;;;;;N;;;;; +12226;CUNEIFORM SIGN MASH;Lo;0;L;;;;1/2;N;;;;; 12227;CUNEIFORM SIGN MASH2;Lo;0;L;;;;;N;;;;; 12228;CUNEIFORM SIGN ME;Lo;0;L;;;;;N;;;;; 12229;CUNEIFORM SIGN MES;Lo;0;L;;;;;N;;;;; 1222A;CUNEIFORM SIGN MI;Lo;0;L;;;;;N;;;;; -1222B;CUNEIFORM SIGN MIN;Lo;0;L;;;;;N;;;;; +1222B;CUNEIFORM SIGN MIN;Lo;0;L;;;;2;N;;;;; 1222C;CUNEIFORM SIGN MU;Lo;0;L;;;;;N;;;;; 1222D;CUNEIFORM SIGN MU OVER MU;Lo;0;L;;;;;N;;;;; 1222E;CUNEIFORM SIGN MUG;Lo;0;L;;;;;N;;;;; @@ -22811,9 +22975,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12308;CUNEIFORM SIGN TUM;Lo;0;L;;;;;N;;;;; 12309;CUNEIFORM SIGN TUR;Lo;0;L;;;;;N;;;;; 
1230A;CUNEIFORM SIGN TUR OVER TUR ZA OVER ZA;Lo;0;L;;;;;N;;;;; -1230B;CUNEIFORM SIGN U;Lo;0;L;;;;;N;;;;; +1230B;CUNEIFORM SIGN U;Lo;0;L;;;;1;N;;;;; 1230C;CUNEIFORM SIGN U GUD;Lo;0;L;;;;;N;;;;; -1230D;CUNEIFORM SIGN U U U;Lo;0;L;;;;;N;;;;; +1230D;CUNEIFORM SIGN U U U;Lo;0;L;;;;3;N;;;;; 1230E;CUNEIFORM SIGN U OVER U PA OVER PA GAR OVER GAR;Lo;0;L;;;;;N;;;;; 1230F;CUNEIFORM SIGN U OVER U SUR OVER SUR;Lo;0;L;;;;;N;;;;; 12310;CUNEIFORM SIGN U OVER U U REVERSED OVER U REVERSED;Lo;0;L;;;;;N;;;;; @@ -22953,7 +23117,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 12396;CUNEIFORM SIGN SAG TIMES IGI GUNU;Lo;0;L;;;;;N;;;;; 12397;CUNEIFORM SIGN TI2;Lo;0;L;;;;;N;;;;; 12398;CUNEIFORM SIGN UM TIMES ME;Lo;0;L;;;;;N;;;;; -12399;CUNEIFORM SIGN U U;Lo;0;L;;;;;N;;;;; +12399;CUNEIFORM SIGN U U;Lo;0;L;;;;2;N;;;;; 12400;CUNEIFORM NUMERIC SIGN TWO ASH;Nl;0;L;;;;2;N;;;;; 12401;CUNEIFORM NUMERIC SIGN THREE ASH;Nl;0;L;;;;3;N;;;;; 12402;CUNEIFORM NUMERIC SIGN FOUR ASH;Nl;0;L;;;;4;N;;;;; @@ -30124,6 +30288,56 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 16E98;MEDEFAIDRIN FULL STOP;Po;0;L;;;;;N;;;;; 16E99;MEDEFAIDRIN SYMBOL AIVA;Po;0;L;;;;;N;;;;; 16E9A;MEDEFAIDRIN EXCLAMATION OH;Po;0;L;;;;;N;;;;; +16EA0;BERIA ERFE CAPITAL LETTER ARKAB;Lu;0;L;;;;;N;;;;16EBB; +16EA1;BERIA ERFE CAPITAL LETTER BASIGNA;Lu;0;L;;;;;N;;;;16EBC; +16EA2;BERIA ERFE CAPITAL LETTER DARBAI;Lu;0;L;;;;;N;;;;16EBD; +16EA3;BERIA ERFE CAPITAL LETTER EH;Lu;0;L;;;;;N;;;;16EBE; +16EA4;BERIA ERFE CAPITAL LETTER FITKO;Lu;0;L;;;;;N;;;;16EBF; +16EA5;BERIA ERFE CAPITAL LETTER GOWAY;Lu;0;L;;;;;N;;;;16EC0; +16EA6;BERIA ERFE CAPITAL LETTER HIRDEABO;Lu;0;L;;;;;N;;;;16EC1; +16EA7;BERIA ERFE CAPITAL LETTER I;Lu;0;L;;;;;N;;;;16EC2; +16EA8;BERIA ERFE CAPITAL LETTER DJAI;Lu;0;L;;;;;N;;;;16EC3; +16EA9;BERIA ERFE CAPITAL LETTER KOBO;Lu;0;L;;;;;N;;;;16EC4; +16EAA;BERIA ERFE CAPITAL LETTER LAKKO;Lu;0;L;;;;;N;;;;16EC5; +16EAB;BERIA ERFE CAPITAL LETTER MERI;Lu;0;L;;;;;N;;;;16EC6; +16EAC;BERIA ERFE CAPITAL LETTER NINI;Lu;0;L;;;;;N;;;;16EC7; 
+16EAD;BERIA ERFE CAPITAL LETTER GNA;Lu;0;L;;;;;N;;;;16EC8; +16EAE;BERIA ERFE CAPITAL LETTER NGAY;Lu;0;L;;;;;N;;;;16EC9; +16EAF;BERIA ERFE CAPITAL LETTER OI;Lu;0;L;;;;;N;;;;16ECA; +16EB0;BERIA ERFE CAPITAL LETTER PI;Lu;0;L;;;;;N;;;;16ECB; +16EB1;BERIA ERFE CAPITAL LETTER ERIGO;Lu;0;L;;;;;N;;;;16ECC; +16EB2;BERIA ERFE CAPITAL LETTER ERIGO TAMURA;Lu;0;L;;;;;N;;;;16ECD; +16EB3;BERIA ERFE CAPITAL LETTER SERI;Lu;0;L;;;;;N;;;;16ECE; +16EB4;BERIA ERFE CAPITAL LETTER SHEP;Lu;0;L;;;;;N;;;;16ECF; +16EB5;BERIA ERFE CAPITAL LETTER TATASOUE;Lu;0;L;;;;;N;;;;16ED0; +16EB6;BERIA ERFE CAPITAL LETTER UI;Lu;0;L;;;;;N;;;;16ED1; +16EB7;BERIA ERFE CAPITAL LETTER WASSE;Lu;0;L;;;;;N;;;;16ED2; +16EB8;BERIA ERFE CAPITAL LETTER AY;Lu;0;L;;;;;N;;;;16ED3; +16EBB;BERIA ERFE SMALL LETTER ARKAB;Ll;0;L;;;;;N;;;16EA0;;16EA0 +16EBC;BERIA ERFE SMALL LETTER BASIGNA;Ll;0;L;;;;;N;;;16EA1;;16EA1 +16EBD;BERIA ERFE SMALL LETTER DARBAI;Ll;0;L;;;;;N;;;16EA2;;16EA2 +16EBE;BERIA ERFE SMALL LETTER EH;Ll;0;L;;;;;N;;;16EA3;;16EA3 +16EBF;BERIA ERFE SMALL LETTER FITKO;Ll;0;L;;;;;N;;;16EA4;;16EA4 +16EC0;BERIA ERFE SMALL LETTER GOWAY;Ll;0;L;;;;;N;;;16EA5;;16EA5 +16EC1;BERIA ERFE SMALL LETTER HIRDEABO;Ll;0;L;;;;;N;;;16EA6;;16EA6 +16EC2;BERIA ERFE SMALL LETTER I;Ll;0;L;;;;;N;;;16EA7;;16EA7 +16EC3;BERIA ERFE SMALL LETTER DJAI;Ll;0;L;;;;;N;;;16EA8;;16EA8 +16EC4;BERIA ERFE SMALL LETTER KOBO;Ll;0;L;;;;;N;;;16EA9;;16EA9 +16EC5;BERIA ERFE SMALL LETTER LAKKO;Ll;0;L;;;;;N;;;16EAA;;16EAA +16EC6;BERIA ERFE SMALL LETTER MERI;Ll;0;L;;;;;N;;;16EAB;;16EAB +16EC7;BERIA ERFE SMALL LETTER NINI;Ll;0;L;;;;;N;;;16EAC;;16EAC +16EC8;BERIA ERFE SMALL LETTER GNA;Ll;0;L;;;;;N;;;16EAD;;16EAD +16EC9;BERIA ERFE SMALL LETTER NGAY;Ll;0;L;;;;;N;;;16EAE;;16EAE +16ECA;BERIA ERFE SMALL LETTER OI;Ll;0;L;;;;;N;;;16EAF;;16EAF +16ECB;BERIA ERFE SMALL LETTER PI;Ll;0;L;;;;;N;;;16EB0;;16EB0 +16ECC;BERIA ERFE SMALL LETTER ERIGO;Ll;0;L;;;;;N;;;16EB1;;16EB1 +16ECD;BERIA ERFE SMALL LETTER ERIGO TAMURA;Ll;0;L;;;;;N;;;16EB2;;16EB2 +16ECE;BERIA ERFE SMALL LETTER 
SERI;Ll;0;L;;;;;N;;;16EB3;;16EB3 +16ECF;BERIA ERFE SMALL LETTER SHEP;Ll;0;L;;;;;N;;;16EB4;;16EB4 +16ED0;BERIA ERFE SMALL LETTER TATASOUE;Ll;0;L;;;;;N;;;16EB5;;16EB5 +16ED1;BERIA ERFE SMALL LETTER UI;Ll;0;L;;;;;N;;;16EB6;;16EB6 +16ED2;BERIA ERFE SMALL LETTER WASSE;Ll;0;L;;;;;N;;;16EB7;;16EB7 +16ED3;BERIA ERFE SMALL LETTER AY;Ll;0;L;;;;;N;;;16EB8;;16EB8 16F00;MIAO LETTER PA;Lo;0;L;;;;;N;;;;; 16F01;MIAO LETTER BA;Lo;0;L;;;;;N;;;;; 16F02;MIAO LETTER YI PA;Lo;0;L;;;;;N;;;;; @@ -30280,8 +30494,13 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 16FE4;KHITAN SMALL SCRIPT FILLER;Mn;0;NSM;;;;;N;;;;; 16FF0;VIETNAMESE ALTERNATE READING MARK CA;Mc;6;L;;;;;N;;;;; 16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;; +16FF2;CHINESE SMALL SIMPLIFIED ER;Lm;0;L;;;;;N;;;;; +16FF3;CHINESE SMALL TRADITIONAL ER;Lm;0;L;;;;;N;;;;; +16FF4;YANGQIN SIGN SLOW ONE BEAT;Nl;0;L;;;;1;N;;;;; +16FF5;YANGQIN SIGN SLOW THREE HALF BEATS;Nl;0;L;;;;3/2;N;;;;; +16FF6;YANGQIN SIGN SLOW TWO BEATS;Nl;0;L;;;;2;N;;;;; 17000;;Lo;0;L;;;;;N;;;;; -187F7;;Lo;0;L;;;;;N;;;;; +187FF;;Lo;0;L;;;;;N;;;;; 18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;; 18801;TANGUT COMPONENT-002;Lo;0;L;;;;;N;;;;; 18802;TANGUT COMPONENT-003;Lo;0;L;;;;;N;;;;; @@ -31522,7 +31741,122 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 18CD5;KHITAN SMALL SCRIPT CHARACTER-18CD5;Lo;0;L;;;;;N;;;;; 18CFF;KHITAN SMALL SCRIPT CHARACTER-18CFF;Lo;0;L;;;;;N;;;;; 18D00;;Lo;0;L;;;;;N;;;;; -18D08;;Lo;0;L;;;;;N;;;;; +18D1E;;Lo;0;L;;;;;N;;;;; +18D80;TANGUT COMPONENT-769;Lo;0;L;;;;;N;;;;; +18D81;TANGUT COMPONENT-770;Lo;0;L;;;;;N;;;;; +18D82;TANGUT COMPONENT-771;Lo;0;L;;;;;N;;;;; +18D83;TANGUT COMPONENT-772;Lo;0;L;;;;;N;;;;; +18D84;TANGUT COMPONENT-773;Lo;0;L;;;;;N;;;;; +18D85;TANGUT COMPONENT-774;Lo;0;L;;;;;N;;;;; +18D86;TANGUT COMPONENT-775;Lo;0;L;;;;;N;;;;; +18D87;TANGUT COMPONENT-776;Lo;0;L;;;;;N;;;;; +18D88;TANGUT COMPONENT-777;Lo;0;L;;;;;N;;;;; +18D89;TANGUT COMPONENT-778;Lo;0;L;;;;;N;;;;; +18D8A;TANGUT COMPONENT-779;Lo;0;L;;;;;N;;;;; +18D8B;TANGUT 
COMPONENT-780;Lo;0;L;;;;;N;;;;; +18D8C;TANGUT COMPONENT-781;Lo;0;L;;;;;N;;;;; +18D8D;TANGUT COMPONENT-782;Lo;0;L;;;;;N;;;;; +18D8E;TANGUT COMPONENT-783;Lo;0;L;;;;;N;;;;; +18D8F;TANGUT COMPONENT-784;Lo;0;L;;;;;N;;;;; +18D90;TANGUT COMPONENT-785;Lo;0;L;;;;;N;;;;; +18D91;TANGUT COMPONENT-786;Lo;0;L;;;;;N;;;;; +18D92;TANGUT COMPONENT-787;Lo;0;L;;;;;N;;;;; +18D93;TANGUT COMPONENT-788;Lo;0;L;;;;;N;;;;; +18D94;TANGUT COMPONENT-789;Lo;0;L;;;;;N;;;;; +18D95;TANGUT COMPONENT-790;Lo;0;L;;;;;N;;;;; +18D96;TANGUT COMPONENT-791;Lo;0;L;;;;;N;;;;; +18D97;TANGUT COMPONENT-792;Lo;0;L;;;;;N;;;;; +18D98;TANGUT COMPONENT-793;Lo;0;L;;;;;N;;;;; +18D99;TANGUT COMPONENT-794;Lo;0;L;;;;;N;;;;; +18D9A;TANGUT COMPONENT-795;Lo;0;L;;;;;N;;;;; +18D9B;TANGUT COMPONENT-796;Lo;0;L;;;;;N;;;;; +18D9C;TANGUT COMPONENT-797;Lo;0;L;;;;;N;;;;; +18D9D;TANGUT COMPONENT-798;Lo;0;L;;;;;N;;;;; +18D9E;TANGUT COMPONENT-799;Lo;0;L;;;;;N;;;;; +18D9F;TANGUT COMPONENT-800;Lo;0;L;;;;;N;;;;; +18DA0;TANGUT COMPONENT-801;Lo;0;L;;;;;N;;;;; +18DA1;TANGUT COMPONENT-802;Lo;0;L;;;;;N;;;;; +18DA2;TANGUT COMPONENT-803;Lo;0;L;;;;;N;;;;; +18DA3;TANGUT COMPONENT-804;Lo;0;L;;;;;N;;;;; +18DA4;TANGUT COMPONENT-805;Lo;0;L;;;;;N;;;;; +18DA5;TANGUT COMPONENT-806;Lo;0;L;;;;;N;;;;; +18DA6;TANGUT COMPONENT-807;Lo;0;L;;;;;N;;;;; +18DA7;TANGUT COMPONENT-808;Lo;0;L;;;;;N;;;;; +18DA8;TANGUT COMPONENT-809;Lo;0;L;;;;;N;;;;; +18DA9;TANGUT COMPONENT-810;Lo;0;L;;;;;N;;;;; +18DAA;TANGUT COMPONENT-811;Lo;0;L;;;;;N;;;;; +18DAB;TANGUT COMPONENT-812;Lo;0;L;;;;;N;;;;; +18DAC;TANGUT COMPONENT-813;Lo;0;L;;;;;N;;;;; +18DAD;TANGUT COMPONENT-814;Lo;0;L;;;;;N;;;;; +18DAE;TANGUT COMPONENT-815;Lo;0;L;;;;;N;;;;; +18DAF;TANGUT COMPONENT-816;Lo;0;L;;;;;N;;;;; +18DB0;TANGUT COMPONENT-817;Lo;0;L;;;;;N;;;;; +18DB1;TANGUT COMPONENT-818;Lo;0;L;;;;;N;;;;; +18DB2;TANGUT COMPONENT-819;Lo;0;L;;;;;N;;;;; +18DB3;TANGUT COMPONENT-820;Lo;0;L;;;;;N;;;;; +18DB4;TANGUT COMPONENT-821;Lo;0;L;;;;;N;;;;; +18DB5;TANGUT COMPONENT-822;Lo;0;L;;;;;N;;;;; +18DB6;TANGUT 
COMPONENT-823;Lo;0;L;;;;;N;;;;; +18DB7;TANGUT COMPONENT-824;Lo;0;L;;;;;N;;;;; +18DB8;TANGUT COMPONENT-825;Lo;0;L;;;;;N;;;;; +18DB9;TANGUT COMPONENT-826;Lo;0;L;;;;;N;;;;; +18DBA;TANGUT COMPONENT-827;Lo;0;L;;;;;N;;;;; +18DBB;TANGUT COMPONENT-828;Lo;0;L;;;;;N;;;;; +18DBC;TANGUT COMPONENT-829;Lo;0;L;;;;;N;;;;; +18DBD;TANGUT COMPONENT-830;Lo;0;L;;;;;N;;;;; +18DBE;TANGUT COMPONENT-831;Lo;0;L;;;;;N;;;;; +18DBF;TANGUT COMPONENT-832;Lo;0;L;;;;;N;;;;; +18DC0;TANGUT COMPONENT-833;Lo;0;L;;;;;N;;;;; +18DC1;TANGUT COMPONENT-834;Lo;0;L;;;;;N;;;;; +18DC2;TANGUT COMPONENT-835;Lo;0;L;;;;;N;;;;; +18DC3;TANGUT COMPONENT-836;Lo;0;L;;;;;N;;;;; +18DC4;TANGUT COMPONENT-837;Lo;0;L;;;;;N;;;;; +18DC5;TANGUT COMPONENT-838;Lo;0;L;;;;;N;;;;; +18DC6;TANGUT COMPONENT-839;Lo;0;L;;;;;N;;;;; +18DC7;TANGUT COMPONENT-840;Lo;0;L;;;;;N;;;;; +18DC8;TANGUT COMPONENT-841;Lo;0;L;;;;;N;;;;; +18DC9;TANGUT COMPONENT-842;Lo;0;L;;;;;N;;;;; +18DCA;TANGUT COMPONENT-843;Lo;0;L;;;;;N;;;;; +18DCB;TANGUT COMPONENT-844;Lo;0;L;;;;;N;;;;; +18DCC;TANGUT COMPONENT-845;Lo;0;L;;;;;N;;;;; +18DCD;TANGUT COMPONENT-846;Lo;0;L;;;;;N;;;;; +18DCE;TANGUT COMPONENT-847;Lo;0;L;;;;;N;;;;; +18DCF;TANGUT COMPONENT-848;Lo;0;L;;;;;N;;;;; +18DD0;TANGUT COMPONENT-849;Lo;0;L;;;;;N;;;;; +18DD1;TANGUT COMPONENT-850;Lo;0;L;;;;;N;;;;; +18DD2;TANGUT COMPONENT-851;Lo;0;L;;;;;N;;;;; +18DD3;TANGUT COMPONENT-852;Lo;0;L;;;;;N;;;;; +18DD4;TANGUT COMPONENT-853;Lo;0;L;;;;;N;;;;; +18DD5;TANGUT COMPONENT-854;Lo;0;L;;;;;N;;;;; +18DD6;TANGUT COMPONENT-855;Lo;0;L;;;;;N;;;;; +18DD7;TANGUT COMPONENT-856;Lo;0;L;;;;;N;;;;; +18DD8;TANGUT COMPONENT-857;Lo;0;L;;;;;N;;;;; +18DD9;TANGUT COMPONENT-858;Lo;0;L;;;;;N;;;;; +18DDA;TANGUT COMPONENT-859;Lo;0;L;;;;;N;;;;; +18DDB;TANGUT COMPONENT-860;Lo;0;L;;;;;N;;;;; +18DDC;TANGUT COMPONENT-861;Lo;0;L;;;;;N;;;;; +18DDD;TANGUT COMPONENT-862;Lo;0;L;;;;;N;;;;; +18DDE;TANGUT COMPONENT-863;Lo;0;L;;;;;N;;;;; +18DDF;TANGUT COMPONENT-864;Lo;0;L;;;;;N;;;;; +18DE0;TANGUT COMPONENT-865;Lo;0;L;;;;;N;;;;; +18DE1;TANGUT 
COMPONENT-866;Lo;0;L;;;;;N;;;;; +18DE2;TANGUT COMPONENT-867;Lo;0;L;;;;;N;;;;; +18DE3;TANGUT COMPONENT-868;Lo;0;L;;;;;N;;;;; +18DE4;TANGUT COMPONENT-869;Lo;0;L;;;;;N;;;;; +18DE5;TANGUT COMPONENT-870;Lo;0;L;;;;;N;;;;; +18DE6;TANGUT COMPONENT-871;Lo;0;L;;;;;N;;;;; +18DE7;TANGUT COMPONENT-872;Lo;0;L;;;;;N;;;;; +18DE8;TANGUT COMPONENT-873;Lo;0;L;;;;;N;;;;; +18DE9;TANGUT COMPONENT-874;Lo;0;L;;;;;N;;;;; +18DEA;TANGUT COMPONENT-875;Lo;0;L;;;;;N;;;;; +18DEB;TANGUT COMPONENT-876;Lo;0;L;;;;;N;;;;; +18DEC;TANGUT COMPONENT-877;Lo;0;L;;;;;N;;;;; +18DED;TANGUT COMPONENT-878;Lo;0;L;;;;;N;;;;; +18DEE;TANGUT COMPONENT-879;Lo;0;L;;;;;N;;;;; +18DEF;TANGUT COMPONENT-880;Lo;0;L;;;;;N;;;;; +18DF0;TANGUT COMPONENT-881;Lo;0;L;;;;;N;;;;; +18DF1;TANGUT COMPONENT-882;Lo;0;L;;;;;N;;;;; +18DF2;TANGUT COMPONENT-883;Lo;0;L;;;;;N;;;;; 1AFF0;KATAKANA LETTER MINNAN TONE-2;Lm;0;L;;;;;N;;;;; 1AFF1;KATAKANA LETTER MINNAN TONE-3;Lm;0;L;;;;;N;;;;; 1AFF2;KATAKANA LETTER MINNAN TONE-4;Lm;0;L;;;;;N;;;;; @@ -32629,6 +32963,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1CCF7;OUTLINED DIGIT SEVEN;Nd;0;EN; 0037;7;7;7;N;;;;; 1CCF8;OUTLINED DIGIT EIGHT;Nd;0;EN; 0038;8;8;8;N;;;;; 1CCF9;OUTLINED DIGIT NINE;Nd;0;EN; 0039;9;9;9;N;;;;; +1CCFA;SNAKE SYMBOL;So;0;ON;;;;;N;;;;; +1CCFB;FLYING SAUCER SYMBOL;So;0;ON;;;;;N;;;;; +1CCFC;NOSE SYMBOL;So;0;ON;;;;;N;;;;; 1CD00;BLOCK OCTANT-3;So;0;ON;;;;;N;;;;; 1CD01;BLOCK OCTANT-23;So;0;ON;;;;;N;;;;; 1CD02;BLOCK OCTANT-123;So;0;ON;;;;;N;;;;; @@ -33065,6 +33402,46 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1CEB1;KEYHOLE;So;0;ON;;;;;N;;;;; 1CEB2;OLD PERSONAL COMPUTER WITH MONITOR IN PORTRAIT ORIENTATION;So;0;ON;;;;;N;;;;; 1CEB3;BLACK RIGHT TRIANGLE CARET;So;0;ON;;;;;N;;;;; +1CEBA;FRAGILE SYMBOL;So;0;ON;;;;;N;;;;; +1CEBB;OFFICE BUILDING SYMBOL;So;0;ON;;;;;N;;;;; +1CEBC;TREE SYMBOL;So;0;ON;;;;;N;;;;; +1CEBD;APPLE SYMBOL;So;0;ON;;;;;N;;;;; +1CEBE;CHERRY SYMBOL;So;0;ON;;;;;N;;;;; +1CEBF;STRAWBERRY SYMBOL;So;0;ON;;;;;N;;;;; +1CEC0;HEBE;So;0;ON;;;;;N;;;;; 
+1CEC1;IRIS;So;0;ON;;;;;N;;;;; +1CEC2;FLORA;So;0;ON;;;;;N;;;;; +1CEC3;METIS;So;0;ON;;;;;N;;;;; +1CEC4;PARTHENOPE;So;0;ON;;;;;N;;;;; +1CEC5;VICTORIA;So;0;ON;;;;;N;;;;; +1CEC6;EGERIA;So;0;ON;;;;;N;;;;; +1CEC7;IRENE;So;0;ON;;;;;N;;;;; +1CEC8;EUNOMIA;So;0;ON;;;;;N;;;;; +1CEC9;PSYCHE;So;0;ON;;;;;N;;;;; +1CECA;THETIS;So;0;ON;;;;;N;;;;; +1CECB;MELPOMENE;So;0;ON;;;;;N;;;;; +1CECC;FORTUNA;So;0;ON;;;;;N;;;;; +1CECD;ASTRONOMICAL SYMBOL FOR ASTEROID PROSERPINA;So;0;ON;;;;;N;;;;; +1CECE;BELLONA;So;0;ON;;;;;N;;;;; +1CECF;AMPHITRITE;So;0;ON;;;;;N;;;;; +1CED0;LEUKOTHEA;So;0;ON;;;;;N;;;;; +1CEE0;GEOMANTIC FIGURE POPULUS;So;0;ON;;;;;N;;;;; +1CEE1;GEOMANTIC FIGURE TRISTITIA;So;0;ON;;;;;N;;;;; +1CEE2;GEOMANTIC FIGURE ALBUS;So;0;ON;;;;;N;;;;; +1CEE3;GEOMANTIC FIGURE FORTUNA MAJOR;So;0;ON;;;;;N;;;;; +1CEE4;GEOMANTIC FIGURE RUBEUS;So;0;ON;;;;;N;;;;; +1CEE5;GEOMANTIC FIGURE ACQUISITIO;So;0;ON;;;;;N;;;;; +1CEE6;GEOMANTIC FIGURE CONJUNCTIO;So;0;ON;;;;;N;;;;; +1CEE7;GEOMANTIC FIGURE CAPUT DRACONIS;So;0;ON;;;;;N;;;;; +1CEE8;GEOMANTIC FIGURE LAETITIA;So;0;ON;;;;;N;;;;; +1CEE9;GEOMANTIC FIGURE CARCER;So;0;ON;;;;;N;;;;; +1CEEA;GEOMANTIC FIGURE AMISSIO;So;0;ON;;;;;N;;;;; +1CEEB;GEOMANTIC FIGURE PUELLA;So;0;ON;;;;;N;;;;; +1CEEC;GEOMANTIC FIGURE FORTUNA MINOR;So;0;ON;;;;;N;;;;; +1CEED;GEOMANTIC FIGURE PUER;So;0;ON;;;;;N;;;;; +1CEEE;GEOMANTIC FIGURE CAUDA DRACONIS;So;0;ON;;;;;N;;;;; +1CEEF;GEOMANTIC FIGURE VIA;So;0;ON;;;;;N;;;;; +1CEF0;MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR;Sm;0;ON;;;;;N;;;;; 1CF00;ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT;Mn;0;NSM;;;;;N;;;;; 1CF01;ZNAMENNY COMBINING MARK NIZKO S KRYZHEM ON LEFT;Mn;0;NSM;;;;;N;;;;; 1CF02;ZNAMENNY COMBINING MARK TSATA ON LEFT;Mn;0;NSM;;;;;N;;;;; @@ -36004,6 +36381,61 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1E5F9;OL ONAL DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 1E5FA;OL ONAL DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 1E5FF;OL ONAL ABBREVIATION SIGN;Po;0;L;;;;;N;;;;; +1E6C0;TAI YO LETTER LOW KO;Lo;0;L;;;;;N;;;;; +1E6C1;TAI YO LETTER HIGH 
KO;Lo;0;L;;;;;N;;;;; +1E6C2;TAI YO LETTER LOW KHO;Lo;0;L;;;;;N;;;;; +1E6C3;TAI YO LETTER HIGH KHO;Lo;0;L;;;;;N;;;;; +1E6C4;TAI YO LETTER GO;Lo;0;L;;;;;N;;;;; +1E6C5;TAI YO LETTER NGO;Lo;0;L;;;;;N;;;;; +1E6C6;TAI YO LETTER CO;Lo;0;L;;;;;N;;;;; +1E6C7;TAI YO LETTER LOW XO;Lo;0;L;;;;;N;;;;; +1E6C8;TAI YO LETTER HIGH XO;Lo;0;L;;;;;N;;;;; +1E6C9;TAI YO LETTER LOW NYO;Lo;0;L;;;;;N;;;;; +1E6CA;TAI YO LETTER HIGH NYO;Lo;0;L;;;;;N;;;;; +1E6CB;TAI YO LETTER DO;Lo;0;L;;;;;N;;;;; +1E6CC;TAI YO LETTER LOW TO;Lo;0;L;;;;;N;;;;; +1E6CD;TAI YO LETTER HIGH TO;Lo;0;L;;;;;N;;;;; +1E6CE;TAI YO LETTER THO;Lo;0;L;;;;;N;;;;; +1E6CF;TAI YO LETTER NO;Lo;0;L;;;;;N;;;;; +1E6D0;TAI YO LETTER BO;Lo;0;L;;;;;N;;;;; +1E6D1;TAI YO LETTER LOW PO;Lo;0;L;;;;;N;;;;; +1E6D2;TAI YO LETTER HIGH PO;Lo;0;L;;;;;N;;;;; +1E6D3;TAI YO LETTER PHO;Lo;0;L;;;;;N;;;;; +1E6D4;TAI YO LETTER LOW FO;Lo;0;L;;;;;N;;;;; +1E6D5;TAI YO LETTER HIGH FO;Lo;0;L;;;;;N;;;;; +1E6D6;TAI YO LETTER MO;Lo;0;L;;;;;N;;;;; +1E6D7;TAI YO LETTER YO;Lo;0;L;;;;;N;;;;; +1E6D8;TAI YO LETTER LO;Lo;0;L;;;;;N;;;;; +1E6D9;TAI YO LETTER VO;Lo;0;L;;;;;N;;;;; +1E6DA;TAI YO LETTER LOW HO;Lo;0;L;;;;;N;;;;; +1E6DB;TAI YO LETTER HIGH HO;Lo;0;L;;;;;N;;;;; +1E6DC;TAI YO LETTER QO;Lo;0;L;;;;;N;;;;; +1E6DD;TAI YO LETTER LOW KVO;Lo;0;L;;;;;N;;;;; +1E6DE;TAI YO LETTER HIGH KVO;Lo;0;L;;;;;N;;;;; +1E6E0;TAI YO LETTER AA;Lo;0;L;;;;;N;;;;; +1E6E1;TAI YO LETTER I;Lo;0;L;;;;;N;;;;; +1E6E2;TAI YO LETTER UE;Lo;0;L;;;;;N;;;;; +1E6E3;TAI YO SIGN UE;Mn;230;NSM;;;;;N;;;;; +1E6E4;TAI YO LETTER U;Lo;0;L;;;;;N;;;;; +1E6E5;TAI YO LETTER AE;Lo;0;L;;;;;N;;;;; +1E6E6;TAI YO SIGN AU;Mn;230;NSM;;;;;N;;;;; +1E6E7;TAI YO LETTER O;Lo;0;L;;;;;N;;;;; +1E6E8;TAI YO LETTER E;Lo;0;L;;;;;N;;;;; +1E6E9;TAI YO LETTER IA;Lo;0;L;;;;;N;;;;; +1E6EA;TAI YO LETTER UEA;Lo;0;L;;;;;N;;;;; +1E6EB;TAI YO LETTER UA;Lo;0;L;;;;;N;;;;; +1E6EC;TAI YO LETTER OO;Lo;0;L;;;;;N;;;;; +1E6ED;TAI YO LETTER AUE;Lo;0;L;;;;;N;;;;; +1E6EE;TAI YO SIGN AY;Mn;230;NSM;;;;;N;;;;; +1E6EF;TAI YO SIGN ANG;Mn;230;NSM;;;;;N;;;;; 
+1E6F0;TAI YO LETTER AN;Lo;0;L;;;;;N;;;;; +1E6F1;TAI YO LETTER AM;Lo;0;L;;;;;N;;;;; +1E6F2;TAI YO LETTER AK;Lo;0;L;;;;;N;;;;; +1E6F3;TAI YO LETTER AT;Lo;0;L;;;;;N;;;;; +1E6F4;TAI YO LETTER AP;Lo;0;L;;;;;N;;;;; +1E6F5;TAI YO SIGN OM;Mn;230;NSM;;;;;N;;;;; +1E6FE;TAI YO SYMBOL MUEANG;Lo;0;L;;;;;N;;;;; +1E6FF;TAI YO XAM LAI;Lm;0;L;;;;;N;;;;; 1E7E0;ETHIOPIC SYLLABLE HHYA;Lo;0;L;;;;;N;;;;; 1E7E1;ETHIOPIC SYLLABLE HHYU;Lo;0;L;;;;;N;;;;; 1E7E2;ETHIOPIC SYLLABLE HHYI;Lo;0;L;;;;;N;;;;; @@ -38079,6 +38511,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1F6D5;HINDU TEMPLE;So;0;ON;;;;;N;;;;; 1F6D6;HUT;So;0;ON;;;;;N;;;;; 1F6D7;ELEVATOR;So;0;ON;;;;;N;;;;; +1F6D8;LANDSLIDE;So;0;ON;;;;;N;;;;; 1F6DC;WIRELESS;So;0;ON;;;;;N;;;;; 1F6DD;PLAYGROUND SLIDE;So;0;ON;;;;;N;;;;; 1F6DE;WHEEL;So;0;ON;;;;;N;;;;; @@ -38228,6 +38661,10 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1F774;LOT OF FORTUNE;So;0;ON;;;;;N;;;;; 1F775;OCCULTATION;So;0;ON;;;;;N;;;;; 1F776;LUNAR ECLIPSE;So;0;ON;;;;;N;;;;; +1F777;VESTA FORM TWO;So;0;ON;;;;;N;;;;; +1F778;ASTRAEA FORM TWO;So;0;ON;;;;;N;;;;; +1F779;HYGIEA FORM TWO;So;0;ON;;;;;N;;;;; +1F77A;PARTHENOPE FORM TWO;So;0;ON;;;;;N;;;;; 1F77B;HAUMEA;So;0;ON;;;;;N;;;;; 1F77C;MAKEMAKE;So;0;ON;;;;;N;;;;; 1F77D;GONGGONG;So;0;ON;;;;;N;;;;; @@ -38498,6 +38935,15 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1F8BB;SOUTH WEST ARROW FROM BAR;So;0;ON;;;;;N;;;;; 1F8C0;LEFTWARDS ARROW FROM DOWNWARDS ARROW;So;0;ON;;;;;N;;;;; 1F8C1;RIGHTWARDS ARROW FROM DOWNWARDS ARROW;So;0;ON;;;;;N;;;;; +1F8D0;LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW;Sm;0;ON;;;;;N;;;;; +1F8D1;LONG RIGHTWARDS HARPOON OVER LONG LEFTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D2;LONG RIGHTWARDS HARPOON ABOVE SHORT LEFTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D3;SHORT RIGHTWARDS HARPOON ABOVE LONG LEFTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D4;LONG LEFTWARDS HARPOON ABOVE SHORT RIGHTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D5;SHORT LEFTWARDS HARPOON ABOVE LONG RIGHTWARDS HARPOON;Sm;0;ON;;;;;N;;;;; +1F8D6;LONG RIGHTWARDS 
ARROW THROUGH X;Sm;0;ON;;;;;N;;;;; +1F8D7;LONG RIGHTWARDS ARROW WITH DOUBLE SLASH;Sm;0;ON;;;;;N;;;;; +1F8D8;LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE;Sm;0;ON;;;;;N;;;;; 1F900;CIRCLED CROSS FORMEE WITH FOUR DOTS;So;0;ON;;;;;N;;;;; 1F901;CIRCLED CROSS FORMEE WITH TWO DOTS;So;0;ON;;;;;N;;;;; 1F902;CIRCLED CROSS FORMEE;So;0;ON;;;;;N;;;;; @@ -38838,6 +39284,10 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FA51;BLACK CHESS KNIGHT-QUEEN;So;0;ON;;;;;N;;;;; 1FA52;BLACK CHESS KNIGHT-ROOK;So;0;ON;;;;;N;;;;; 1FA53;BLACK CHESS KNIGHT-BISHOP;So;0;ON;;;;;N;;;;; +1FA54;WHITE CHESS FERZ;So;0;ON;;;;;N;;;;; +1FA55;WHITE CHESS ALFIL;So;0;ON;;;;;N;;;;; +1FA56;BLACK CHESS FERZ;So;0;ON;;;;;N;;;;; +1FA57;BLACK CHESS ALFIL;So;0;ON;;;;;N;;;;; 1FA60;XIANGQI RED GENERAL;So;0;ON;;;;;N;;;;; 1FA61;XIANGQI RED MANDARIN;So;0;ON;;;;;N;;;;; 1FA62;XIANGQI RED ELEPHANT;So;0;ON;;;;;N;;;;; @@ -38875,6 +39325,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FA87;MARACAS;So;0;ON;;;;;N;;;;; 1FA88;FLUTE;So;0;ON;;;;;N;;;;; 1FA89;HARP;So;0;ON;;;;;N;;;;; +1FA8A;TROMBONE;So;0;ON;;;;;N;;;;; +1FA8E;TREASURE CHEST;So;0;ON;;;;;N;;;;; 1FA8F;SHOVEL;So;0;ON;;;;;N;;;;; 1FA90;RINGED PLANET;So;0;ON;;;;;N;;;;; 1FA91;CHAIR;So;0;ON;;;;;N;;;;; @@ -38931,6 +39383,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FAC4;PREGNANT PERSON;So;0;ON;;;;;N;;;;; 1FAC5;PERSON WITH CROWN;So;0;ON;;;;;N;;;;; 1FAC6;FINGERPRINT;So;0;ON;;;;;N;;;;; +1FAC8;HAIRY CREATURE;So;0;ON;;;;;N;;;;; +1FACD;ORCA;So;0;ON;;;;;N;;;;; 1FACE;MOOSE;So;0;ON;;;;;N;;;;; 1FACF;DONKEY;So;0;ON;;;;;N;;;;; 1FAD0;BLUEBERRIES;So;0;ON;;;;;N;;;;; @@ -38957,6 +39411,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FAE7;BUBBLES;So;0;ON;;;;;N;;;;; 1FAE8;SHAKING FACE;So;0;ON;;;;;N;;;;; 1FAE9;FACE WITH BAGS UNDER EYES;So;0;ON;;;;;N;;;;; +1FAEA;DISTORTED FACE;So;0;ON;;;;;N;;;;; +1FAEF;FIGHT CLOUD;So;0;ON;;;;;N;;;;; 1FAF0;HAND WITH INDEX FINGER AND THUMB CROSSED;So;0;ON;;;;;N;;;;; 1FAF1;RIGHTWARDS HAND;So;0;ON;;;;;N;;;;; 1FAF2;LEFTWARDS HAND;So;0;ON;;;;;N;;;;; @@ 
-39215,14 +39671,15 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 1FBF7;SEGMENTED DIGIT SEVEN;Nd;0;EN; 0037;7;7;7;N;;;;; 1FBF8;SEGMENTED DIGIT EIGHT;Nd;0;EN; 0038;8;8;8;N;;;;; 1FBF9;SEGMENTED DIGIT NINE;Nd;0;EN; 0039;9;9;9;N;;;;; +1FBFA;ALARM BELL SYMBOL;So;0;ON;;;;;N;;;;; 20000;;Lo;0;L;;;;;N;;;;; 2A6DF;;Lo;0;L;;;;;N;;;;; 2A700;;Lo;0;L;;;;;N;;;;; -2B739;;Lo;0;L;;;;;N;;;;; +2B73F;;Lo;0;L;;;;;N;;;;; 2B740;;Lo;0;L;;;;;N;;;;; 2B81D;;Lo;0;L;;;;;N;;;;; 2B820;;Lo;0;L;;;;;N;;;;; -2CEA1;;Lo;0;L;;;;;N;;;;; +2CEAD;;Lo;0;L;;;;;N;;;;; 2CEB0;;Lo;0;L;;;;;N;;;;; 2EBE0;;Lo;0;L;;;;;N;;;;; 2EBF0;;Lo;0;L;;;;;N;;;;; @@ -39773,6 +40230,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;; 3134A;;Lo;0;L;;;;;N;;;;; 31350;;Lo;0;L;;;;;N;;;;; 323AF;;Lo;0;L;;;;;N;;;;; +323B0;;Lo;0;L;;;;;N;;;;; +33479;;Lo;0;L;;;;;N;;;;; E0001;LANGUAGE TAG;Cf;0;BN;;;;;N;;;;; E0020;TAG SPACE;Cf;0;BN;;;;;N;;;;; E0021;TAG EXCLAMATION MARK;Cf;0;BN;;;;;N;;;;; diff --git a/src/libfsm/clone.c b/src/libfsm/clone.c index 2161599ae..068aca1c2 100644 --- a/src/libfsm/clone.c +++ b/src/libfsm/clone.c @@ -179,7 +179,7 @@ static int copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { struct copy_eager_output_ids_env *env = opaque; - if (!fsm_seteageroutput(env->dst, state, id)) { + if (!fsm_eager_output_set(env->dst, state, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/consolidate.c b/src/libfsm/consolidate.c index b7a8905b2..55c3bfd64 100644 --- a/src/libfsm/consolidate.c +++ b/src/libfsm/consolidate.c @@ -294,7 +294,7 @@ consolidate_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opa assert(state < env->mapping_count); const fsm_state_t dst_state = env->mapping[state]; - if (!fsm_seteageroutput(env->dst, dst_state, id)) { + if (!fsm_eager_output_set(env->dst, dst_state, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 9833fd878..3f748aeae 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c 
@@ -280,6 +280,9 @@ fsm_determinise_with_config(struct fsm *nfa, assert(fsm_all(nfa, fsm_isdfa)); #endif + /* This should not be carried over from the NFA. */ + assert(nfa->linkage_info == NULL); + res = FSM_DETERMINISE_WITH_CONFIG_OK; cleanup: @@ -2599,7 +2602,7 @@ remap_eager_output_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { (void)state; struct remap_eager_output_env *env = opaque; - if (!fsm_seteageroutput(env->dst, env->dst_state, id)) { + if (!fsm_eager_output_set(env->dst, env->dst_state, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/eager_output.c b/src/libfsm/eager_output.c index 00fa1b5f0..a6dcdc89f 100644 --- a/src/libfsm/eager_output.c +++ b/src/libfsm/eager_output.c @@ -107,13 +107,13 @@ fsm_eager_output_free(struct fsm *fsm) } int -fsm_seteageroutputonends(struct fsm *fsm, fsm_output_id_t id) +fsm_eager_output_set_on_ends(struct fsm *fsm, fsm_output_id_t id) { assert(fsm != NULL); const size_t count = fsm_countstates(fsm); for (size_t i = 0; i < count; i++) { if (fsm_isend(fsm, i)) { - if (!fsm_seteageroutput(fsm, i, id)) { return 0; } + if (!fsm_eager_output_set(fsm, i, id)) { return 0; } } } return 1; @@ -157,7 +157,7 @@ grow_htab(const struct fsm_alloc *alloc, struct eager_output_htab *htab) } int -fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) +fsm_eager_output_set(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) { assert(fsm != NULL); @@ -172,15 +172,11 @@ fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) const uint64_t mask = info->htab.bucket_count - 1; assert((mask & info->htab.bucket_count) == 0); /* power of 2 */ - /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { const size_t b_i = (hash + probes) & mask; struct eager_output_bucket *b = &info->htab.buckets[b_i]; - /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ - /* __func__, 
state, b_i, b->state, (void *)b->entry); */ struct eager_output_entry *e = b->entry; - if (e == NULL) { /* empty */ - /* add */ + if (e == NULL) { /* empty, add */ const size_t alloc_sz = sizeof(*e) + DEF_ENTRY_CEIL * sizeof(e->ids[0]); e = f_calloc(fsm->alloc, 1, alloc_sz); @@ -191,8 +187,6 @@ fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) b->state = state; b->entry = e; info->htab.buckets_used++; - /* fprintf(stderr, "%s: buckets_used %zd\n", __func__, info->htab.buckets_used); */ - /* fprintf(stderr, "%s: saved new entry in bucket %zd\n", __func__, b_i); */ } else if (b->state != state) { /* collision */ continue; } @@ -214,7 +208,6 @@ fsm_seteageroutput(struct fsm *fsm, fsm_state_t state, fsm_output_id_t id) } e->ids[e->used++] = id; - /* fprintf(stderr, "%s: e->ids_used %u\n", __func__, e->used); */ fsm->states[state].has_eager_outputs = 1; return 1; } @@ -259,8 +252,6 @@ fsm_eager_output_iter_state(const struct fsm *fsm, for (size_t probes = 0; probes < info->htab.bucket_count; probes++) { const size_t b_i = (hash + probes) & mask; struct eager_output_bucket *b = &info->htab.buckets[b_i]; - /* fprintf(stderr, "%s: state %d -> b_i %zd, state %d, entry %p\n", */ - /* __func__, state, b_i, b->state, (void *)b->entry); */ struct eager_output_entry *e = b->entry; if (e == NULL) { /* empty */ return; @@ -296,7 +287,9 @@ fsm_eager_output_count(const struct fsm *fsm, fsm_state_t state) } struct get_env { + bool ok; size_t count; + size_t ceil; fsm_output_id_t *buf; }; @@ -305,6 +298,10 @@ append_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { struct get_env *env = opaque; (void)state; + if (env->count == env->ceil) { + env->ok = false; + return 0; + } env->buf[env->count++] = id; return 1; } @@ -317,12 +314,18 @@ cmp_fsm_output_id_t(const void *pa, const void *pb) return a < b ? -1 : a > b ? 
1 : 0; } -void -fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, fsm_output_id_t *buf) +int +fsm_eager_output_get(const struct fsm *fsm, fsm_state_t state, + size_t buf_count, fsm_output_id_t *id_buf) { - struct get_env env = { .buf = buf }; + struct get_env env = { + .ok = true, + .buf = id_buf, + .ceil = buf_count, + }; fsm_eager_output_iter_state(fsm, state, append_cb, &env); - qsort(buf, env.count, sizeof(buf[0]), cmp_fsm_output_id_t); + qsort(id_buf, env.count, sizeof(id_buf[0]), cmp_fsm_output_id_t); + return env.ok ? 1 : 0; } void @@ -335,12 +338,9 @@ fsm_eager_output_iter_all(const struct fsm *fsm, struct eager_output_info *info = fsm->eager_output_info; - /* fprintf(stderr, "%s: bucket_count %zd\n", __func__, info->htab.bucket_count); */ for (size_t b_i = 0; b_i < info->htab.bucket_count; b_i++) { struct eager_output_bucket *b = &info->htab.buckets[b_i]; struct eager_output_entry *e = b->entry; - /* fprintf(stderr, "%s: b_i %zd, state %d, entry %p\n", */ - /* __func__, b_i, b->state, (void *)b->entry); */ if (e == NULL) { /* empty */ continue; } diff --git a/src/libfsm/eager_output.h b/src/libfsm/eager_output.h index 6093adc9e..b90da935c 100644 --- a/src/libfsm/eager_output.h +++ b/src/libfsm/eager_output.h @@ -1,3 +1,9 @@ +/* + * Copyright 2024 Scott Vokes + * + * See LICENCE for the full copyright terms. + */ + #ifndef EAGER_OUTPUT_H #define EAGER_OUTPUT_H @@ -13,12 +19,15 @@ fsm_eager_output_init(struct fsm *fsm); void fsm_eager_output_free(struct fsm *fsm); +/* Does an FSM have eager outputs? */ bool fsm_eager_output_has_eager_output(const struct fsm *fsm); +/* Does a particular state have eager outputs? */ bool fsm_eager_output_state_has_eager_output(const struct fsm *fsm, fsm_state_t state); +/* Dump eager outputs on an FSM. (For debugging.) 
*/ void fsm_eager_output_dump(FILE *f, const struct fsm *fsm); @@ -28,14 +37,17 @@ fsm_eager_output_dump(FILE *f, const struct fsm *fsm); typedef int fsm_eager_output_iter_cb(fsm_state_t state, fsm_output_id_t id, void *opaque); +/* Iterate over eager outputs on a state. */ void fsm_eager_output_iter_state(const struct fsm *fsm, fsm_state_t state, fsm_eager_output_iter_cb *cb, void *opaque); +/* Iterate over all eager outputs on an FSM. */ void fsm_eager_output_iter_all(const struct fsm *fsm, fsm_eager_output_iter_cb *cb, void *opaque); +/* Compact eager output metadata. */ int fsm_eager_output_compact(struct fsm *fsm, fsm_state_t *mapping, size_t mapping_count); diff --git a/src/libfsm/epsilons.c b/src/libfsm/epsilons.c index 8041c29d3..926e6d9bf 100644 --- a/src/libfsm/epsilons.c +++ b/src/libfsm/epsilons.c @@ -246,7 +246,7 @@ fsm_remove_epsilons(struct fsm *nfa) } for (size_t i = 0; i < eager_output_buf.used; i++) { - if (!fsm_seteageroutput(nfa, s, eager_output_buf.ids[i])) { + if (!fsm_eager_output_set(nfa, s, eager_output_buf.ids[i])) { goto cleanup; } } @@ -291,7 +291,7 @@ fsm_remove_epsilons(struct fsm *nfa) * reachable. This doesn't check that the FROM state is reachable from * the start state (trim will do that soon enough), it's just used to * check which states will become unreachable once epsilon edges are - * removed. We don't need to add eager endids for them, because they + * removed. We don't need to add eager outputs for them, because they * will soon be disconnected from the epsilon-free NFA. */ static void mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_label) @@ -311,9 +311,7 @@ mark_states_reachable_by_label(const struct fsm *nfa, uint64_t *reachable_by_lab struct fsm_state *s = &nfa->states[s_i]; /* Clear the visited flag, it will be used to avoid cycles. 
*/ -#if 1 assert(s->visited == 0); /* stale */ -#endif s->visited = 0; edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &egi); diff --git a/src/libfsm/exec.c b/src/libfsm/exec.c index e74cf7c7c..c3e158930 100644 --- a/src/libfsm/exec.c +++ b/src/libfsm/exec.c @@ -55,7 +55,6 @@ struct check_eager_outputs_for_state_env { static int match_eager_outputs_for_state_cb(fsm_state_t state, fsm_end_id_t id, void *opaque) { - /* HACK update the types here once it's working */ (void)state; struct check_eager_outputs_for_state_env *env = opaque; #if LOG_EAGER @@ -68,7 +67,6 @@ match_eager_outputs_for_state_cb(fsm_state_t state, fsm_end_id_t id, void *opaqu static int match_eager_outputs_for_state(const struct fsm *fsm, fsm_state_t state) { - /* HACK update the types here once it's working */ fsm_eager_output_cb *cb = NULL; void *opaque = NULL; fsm_eager_output_get_cb(fsm, &cb, &opaque); diff --git a/src/libfsm/fsm.c b/src/libfsm/fsm.c index c442c8262..866650a9c 100644 --- a/src/libfsm/fsm.c +++ b/src/libfsm/fsm.c @@ -42,6 +42,11 @@ free_contents(struct fsm *fsm) fsm_endid_free(fsm); fsm_eager_output_free(fsm); + if (fsm->linkage_info != NULL) { + state_set_free(fsm->linkage_info->anchored_starts); + state_set_free(fsm->linkage_info->anchored_ends); + f_free(fsm->alloc, fsm->linkage_info); + } f_free(fsm->alloc, fsm->states); } @@ -72,6 +77,7 @@ fsm_new_statealloc(const struct fsm_alloc *alloc, size_t statealloc) new->endcount = 0; new->capture_info = NULL; new->endid_info = NULL; + new->linkage_info = NULL; new->states = f_malloc(new->alloc, new->statealloc * sizeof *new->states); if (new->states == NULL) { @@ -144,6 +150,7 @@ fsm_move(struct fsm *dst, struct fsm *src) dst->capture_info = src->capture_info; dst->endid_info = src->endid_info; dst->eager_output_info = src->eager_output_info; + dst->linkage_info = src->linkage_info; f_free(src->alloc, src); } diff --git a/src/libfsm/internal.h b/src/libfsm/internal.h index 094723fdb..06658a78e 100644 --- 
a/src/libfsm/internal.h +++ b/src/libfsm/internal.h @@ -18,6 +18,7 @@ struct bm; struct edge_set; struct state_set; struct state_array; +struct linkage_info; /* * The alphabet (Sigma) for libfsm's FSM is arbitrary octets. @@ -80,6 +81,29 @@ struct fsm { struct fsm_capture_info *capture_info; struct endid_info *endid_info; struct eager_output_info *eager_output_info; + struct linkage_info *linkage_info; +}; + +#define LINKAGE_NO_STATE ((fsm_state_t)-1) + +/* Internal structure for storing structural info about an NFA. + * This is currently only used by fsm_union_repeated_pattern_group, + * which needs to identify a couple components of the NFA in order + * to link groups of repeated pattern together correctly. */ +struct linkage_info { + /* The states with a /./ self edge representing the unanchored + * start and end, or LINKAGE_NO_STATE. There can be at most one + * of each. */ + fsm_state_t unanchored_start_loop; + fsm_state_t unanchored_end_loop; + + /* The end state following the unanchored end loop. */ + fsm_state_t unanchored_end_loop_end; + + /* States that link to paths only reachable from the beginning of input. */ + struct state_set *anchored_starts; + /* States leading to an anchored end. */ + struct state_set *anchored_ends; }; struct fsm * diff --git a/src/libfsm/lexer.c b/src/libfsm/lexer.c index 8bd374cec..3bf26b3b6 100644 --- a/src/libfsm/lexer.c +++ b/src/libfsm/lexer.c @@ -15,6 +15,26 @@ static enum lx_token z3(struct lx *lx); static enum lx_token z4(struct lx *lx); static enum lx_token z5(struct lx *lx); +static int +lx_advance_end(struct lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. 
*/ #if __STDC_VERSION__ >= 199901L inline #endif @@ -35,18 +55,19 @@ lx_getc(struct lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_getc((struct lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -55,10 +76,7 @@ lx_ungetc(struct lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -68,13 +86,20 @@ lx_ungetc(struct lx *lx, int c) } } +/* Get a character from fgetc and push it to the buffer */ int lx_fgetc(struct lx *lx) { assert(lx != NULL); assert(lx->getc_opaque != NULL); - return fgetc(lx->getc_opaque); + const int c = fgetc(lx->getc_opaque); + if (c == EOF) { + lx->c = EOF; + return EOF; + } else { + return c; + } } int @@ -119,6 +144,17 @@ lx_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_dynclear(void *buf_opaque) { @@ -158,44 +194,53 @@ lx_dynfree(void *buf_opaque) static enum lx_token z0(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\n': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. 
"a" */ - lx_ungetc(lx, c); return lx->z(lx); + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); case S2: /* e.g. "" */ - lx_ungetc(lx, c); return lx->z = z1, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z1, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return lx->z = z1, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S0: @@ -212,44 +257,40 @@ z0(struct lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z1(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '0': case '1': @@ -268,7 +309,9 @@ z1(struct lx *lx) case '\r': case ' ': state = S4; break; case ']': state = S5; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -284,15 +327,15 @@ z1(struct lx *lx) case '7': case '8': case '9': break; - default: lx_ungetc(lx, c); return TOK_ENDID; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ENDID; } break; case S2: /* e.g. 
"," */ - lx_ungetc(lx, c); return TOK_COMMA; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_COMMA; case S3: /* e.g. "#" */ - lx_ungetc(lx, c); return lx->z = z0, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z0, lx->z(lx); case S4: /* e.g. "\\x09" */ switch ((unsigned char) c) { @@ -300,16 +343,29 @@ z1(struct lx *lx) case '\n': case '\r': case ' ': break; - default: lx_ungetc(lx, c); return lx->z(lx); + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); } break; case S5: /* e.g. "]" */ - lx_ungetc(lx, c); return lx->z = z5, TOK_CLOSEENDIDS; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, TOK_CLOSEENDIDS; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_ENDID; + case S2: return TOK_COMMA; + case S3: return lx->z = z0, lx->z(lx); + case S4: return TOK_EOF; + case S5: return lx->z = z5, TOK_CLOSEENDIDS; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S3: @@ -325,106 +381,105 @@ z1(struct lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_ENDID; - case S2: return TOK_COMMA; - case S3: return TOK_EOF; - case S4: return TOK_EOF; - case S5: return TOK_CLOSEENDIDS; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z2(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '\'': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "'" */ - lx_ungetc(lx, c); return lx->z = z5, TOK_LABEL; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, TOK_LABEL; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return lx->z = z5, TOK_LABEL; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_LABEL; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z3(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; case '"': state = S3; break; @@ -450,15 +505,15 @@ z3(struct lx *lx) case 'r': case 't': case 'v': state = S6; break; - default: lx_ungetc(lx, c); return TOK_CHAR; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S2: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S2: /* e.g. 
"\\x00" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z5, TOK_LABEL; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, TOK_LABEL; case S4: /* e.g. "\\x" */ switch ((unsigned char) c) { @@ -484,7 +539,9 @@ z3(struct lx *lx) case 'd': case 'e': case 'f': state = S7; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -498,14 +555,14 @@ z3(struct lx *lx) case '5': case '6': case '7': break; - default: lx_ungetc(lx, c); return TOK_OCT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S6: /* e.g. "\\f" */ - lx_ungetc(lx, c); return TOK_ESC; + case S6: /* e.g. "\\\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ESC; - case S7: /* e.g. "\\xa" */ + case S7: /* e.g. "\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -529,76 +586,92 @@ z3(struct lx *lx) case 'd': case 'e': case 'f': break; - default: lx_ungetc(lx, c); return TOK_HEX; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_HEX; } break; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; - case S3: return TOK_LABEL; + case S3: return lx->z = z5, TOK_LABEL; case S5: return TOK_OCT; case S6: return TOK_ESC; case S7: return TOK_HEX; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } 
static enum lx_token z4(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\n': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return lx->z(lx); + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); case S2: /* e.g. "" */ - lx_ungetc(lx, c); return lx->z = z5, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z5, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return lx->z = z5, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S0: @@ -615,46 +688,42 @@ z4(struct lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z5(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, 
S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case ',': state = S1; break; case ';': state = S2; break; @@ -732,31 +801,35 @@ z5(struct lx *lx) case '\n': case '\r': case ' ': state = S13; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S1: /* e.g. "," */ - lx_ungetc(lx, c); return TOK_COMMA; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_COMMA; case S2: /* e.g. ";" */ - lx_ungetc(lx, c); return TOK_SEP; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_SEP; - case S3: /* e.g. "?" */ - lx_ungetc(lx, c); return TOK_ANY; + case S3: /* e.g. "\077" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ANY; case S4: /* e.g. "-" */ switch ((unsigned char) c) { case '>': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S5: /* e.g. "[" */ - lx_ungetc(lx, c); return lx->z = z1, TOK_OPENENDIDS; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENENDIDS; case S6: /* e.g. "=" */ - lx_ungetc(lx, c); return TOK_EQUALS; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_EQUALS; case S7: /* e.g. 
"e" */ switch ((unsigned char) c) { @@ -823,7 +896,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'n': state = S19; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -892,11 +965,11 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 't': state = S14; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; - case S9: /* e.g. "a" */ + case S9: /* e.g. "0" */ switch ((unsigned char) c) { case '0': case '1': @@ -961,18 +1034,18 @@ z5(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; case S10: /* e.g. "'" */ - lx_ungetc(lx, c); return lx->z = z2, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z2, lx->z(lx); case S11: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z3, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z3, lx->z(lx); case S12: /* e.g. "#" */ - lx_ungetc(lx, c); return lx->z = z4, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, lx->z(lx); case S13: /* e.g. 
"\\x09" */ switch ((unsigned char) c) { @@ -980,7 +1053,7 @@ z5(struct lx *lx) case '\n': case '\r': case ' ': break; - default: lx_ungetc(lx, c); return lx->z(lx); + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); } break; @@ -1049,7 +1122,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'a': state = S15; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1118,7 +1191,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'r': state = S16; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1187,7 +1260,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 't': state = S17; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1257,12 +1330,12 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case ':': state = S18; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; case S18: /* e.g. "start:" */ - lx_ungetc(lx, c); return TOK_START; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_START; case S19: /* e.g. "en" */ switch ((unsigned char) c) { @@ -1329,7 +1402,7 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case 'd': state = S20; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; @@ -1399,53 +1472,34 @@ z5(struct lx *lx) case 'y': case 'z': state = S9; break; case ':': state = S21; break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; case S21: /* e.g. 
"end:" */ - lx_ungetc(lx, c); return TOK_END; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_END; case S22: /* e.g. "->" */ - lx_ungetc(lx, c); return TOK_TO; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TO; default: ; /* unreached */ } - - switch (state) { - case S10: - case S11: - case S12: - case S13: - break; - - default: - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } - break; - - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_COMMA; case S2: return TOK_SEP; case S3: return TOK_ANY; - case S5: return TOK_OPENENDIDS; + case S5: return lx->z = z1, TOK_OPENENDIDS; case S6: return TOK_EQUALS; case S7: return TOK_IDENT; case S8: return TOK_IDENT; case S9: return TOK_IDENT; - case S10: return TOK_EOF; - case S11: return TOK_EOF; - case S12: return TOK_EOF; + case S10: return lx->z = z2, lx->z(lx); + case S11: return lx->z = z3, lx->z(lx); + case S12: return lx->z = z4, lx->z(lx); case S13: return TOK_EOF; case S14: return TOK_IDENT; case S15: return TOK_IDENT; @@ -1456,8 +1510,34 @@ z5(struct lx *lx) case S20: return TOK_IDENT; case S21: return TOK_END; case S22: return TOK_TO; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + switch (state) { + case S10: + case S11: + case S12: + case S13: + break; + + default: + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + break; + + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -1640,6 +1720,7 @@ lx_init(struct lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_dynpop; } enum lx_token diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms index ab28b0a21..a498ced4d 100644 --- a/src/libfsm/libfsm.syms +++ 
b/src/libfsm/libfsm.syms @@ -100,10 +100,10 @@ fsm_increndids fsm_endid_dump -fsm_seteageroutput -fsm_seteageroutputonends +fsm_eager_output_set +fsm_eager_output_set_on_ends fsm_eager_output_count -# short term hack +fsm_eager_output_get_cb fsm_eager_output_set_cb fsm_eager_output_dump fsm_eager_output_get diff --git a/src/libfsm/merge.c b/src/libfsm/merge.c index ccc1568ff..267b5b1df 100644 --- a/src/libfsm/merge.c +++ b/src/libfsm/merge.c @@ -214,7 +214,7 @@ static int copy_eager_output_ids_cb(fsm_state_t state, fsm_output_id_t id, void *opaque) { struct copy_eager_output_ids_env *env = opaque; - if (!fsm_seteageroutput(env->dst, state + env->base_src, id)) { + if (!fsm_eager_output_set(env->dst, state + env->base_src, id)) { env->ok = false; return 0; } diff --git a/src/libfsm/minimise_test_oracle.c b/src/libfsm/minimise_test_oracle.c index 20d4633a1..65bd3b43d 100644 --- a/src/libfsm/minimise_test_oracle.c +++ b/src/libfsm/minimise_test_oracle.c @@ -212,7 +212,7 @@ fsm_minimise_test_oracle(const struct fsm *fsm) int eres = fsm_endid_get(fsm, i, endid_count_a, ids_a); assert(eres == 1); - fsm_eager_output_get(fsm, i, eo_ids_a); + fsm_eager_output_get(fsm, i, max_eager_output_count, eo_ids_a); bool found = false; /* note: skipping eg 0 here since that's the empty set */ @@ -234,7 +234,7 @@ fsm_minimise_test_oracle(const struct fsm *fsm) endid_count_b, ids_b); assert(eres == 1); - fsm_eager_output_get(fsm, end_md_group_leaders[eg_i], eo_ids_b); + fsm_eager_output_get(fsm, end_md_group_leaders[eg_i], max_eager_output_count, eo_ids_b); if ((0 == memcmp(ids_a, ids_b, endid_count_a * sizeof(ids_a[0]))) && (0 == memcmp(eo_ids_a, eo_ids_b, eager_output_count_a * sizeof(eo_ids_a[0])))) { diff --git a/src/libfsm/parser.c b/src/libfsm/parser.c index e4ac8a31b..ec9bf4f78 100644 --- a/src/libfsm/parser.c +++ b/src/libfsm/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 153 "src/libfsm/parser.act" +#line 27 "src/libfsm/parser.act" #include @@ -179,7 +179,7 @@ 
p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: CHAR */ { -#line 247 "src/libfsm/parser.act" +#line 243 "src/libfsm/parser.act" assert(lex_state->buf.a[0] != '\0'); assert(lex_state->buf.a[1] == '\0'); @@ -196,7 +196,7 @@ p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: ESC */ { -#line 171 "src/libfsm/parser.act" +#line 167 "src/libfsm/parser.act" assert(0 == strncmp(lex_state->buf.a, "\\", 1)); assert(2 == strlen(lex_state->buf.a)); @@ -224,7 +224,7 @@ p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: HEX */ { -#line 240 "src/libfsm/parser.act" +#line 214 "src/libfsm/parser.act" unsigned long u; char *e; @@ -263,7 +263,7 @@ p_label(fsm fsm, lex_state lex_state, act_state act_state, char *ZOc) { /* BEGINNING OF EXTRACT: OCT */ { -#line 211 "src/libfsm/parser.act" +#line 185 "src/libfsm/parser.act" unsigned long u; char *e; @@ -338,7 +338,7 @@ ZL2_items:; case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT */ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIa = xstrdup(lex_state->buf.a); @@ -366,7 +366,7 @@ ZL2_items:; goto ZL2_items; /* END OF INLINE: items */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -451,7 +451,7 @@ ZL2_xend_C_Cend_Hstates:; } /* END OF INLINE: xend::end-states */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -489,7 +489,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) { /* BEGINNING OF ACTION: err-expected-start */ { -#line 404 "src/libfsm/parser.act" +#line 402 "src/libfsm/parser.act" err_expected(lex_state, "'start:'"); @@ -507,7 +507,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT */ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIn = 
xstrdup(lex_state->buf.a); @@ -530,7 +530,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) } /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((ZIn)); @@ -588,7 +588,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: mark-start */ { -#line 336 "src/libfsm/parser.act" +#line 335 "src/libfsm/parser.act" fsm_setstart(fsm, (ZIs)); @@ -597,7 +597,7 @@ p_xstart(fsm fsm, lex_state lex_state, act_state act_state) /* END OF ACTION: mark-start */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((ZIn)); @@ -639,7 +639,7 @@ p_xend(fsm fsm, lex_state lex_state, act_state act_state) { /* BEGINNING OF ACTION: err-expected-end */ { -#line 408 "src/libfsm/parser.act" +#line 406 "src/libfsm/parser.act" err_expected(lex_state, "'end:'"); @@ -687,7 +687,7 @@ p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT */ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIn = xstrdup(lex_state->buf.a); @@ -705,7 +705,7 @@ p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z /* END OF INLINE: ident */ /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((ZIn)); @@ -763,7 +763,7 @@ p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: mark-end */ { -#line 340 "src/libfsm/parser.act" +#line 339 "src/libfsm/parser.act" fsm_setend(fsm, (ZIs), 1); @@ -772,7 +772,7 @@ p_xend_C_Cend_Hstate(fsm fsm, lex_state lex_state, act_state act_state, state *Z /* END OF ACTION: mark-end */ /* 
BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((ZIn)); @@ -810,7 +810,7 @@ p_fsm(fsm fsm, lex_state lex_state, act_state act_state) ADVANCE_LEXER; /* BEGINNING OF ACTION: free-statelist */ { -#line 366 "src/libfsm/parser.act" +#line 353 "src/libfsm/parser.act" struct act_statelist *p; struct act_statelist *next; @@ -834,7 +834,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-syntax */ { -#line 413 "src/libfsm/parser.act" +#line 410 "src/libfsm/parser.act" err(lex_state, "Syntax error"); exit(EXIT_FAILURE); @@ -865,7 +865,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-comma */ { -#line 400 "src/libfsm/parser.act" +#line 398 "src/libfsm/parser.act" err_expected(lex_state, "','"); @@ -895,7 +895,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-sep */ { -#line 392 "src/libfsm/parser.act" +#line 390 "src/libfsm/parser.act" err_expected(lex_state, "';'"); @@ -923,7 +923,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) case (TOK_IDENT): /* BEGINNING OF EXTRACT: IDENT */ { -#line 252 "src/libfsm/parser.act" +#line 250 "src/libfsm/parser.act" /* XXX: don't exit in library code */ ZIb = xstrdup(lex_state->buf.a); @@ -941,7 +941,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF INLINE: ident */ /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((*ZIa)); @@ -999,7 +999,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((ZIb)); @@ -1057,7 +1057,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 
"src/libfsm/parser.act" free((*ZIa)); @@ -1066,7 +1066,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF ACTION: free */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((ZIb)); @@ -1081,7 +1081,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) ADVANCE_LEXER; /* BEGINNING OF ACTION: add-edge-any */ { -#line 376 "src/libfsm/parser.act" +#line 375 "src/libfsm/parser.act" if (!fsm_addedge_any(fsm, (ZIx), (ZIy))) { perror("fsm_addedge_any"); @@ -1104,7 +1104,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) } /* BEGINNING OF ACTION: add-edge-literal */ { -#line 369 "src/libfsm/parser.act" +#line 368 "src/libfsm/parser.act" if (!fsm_addedge_literal(fsm, (ZIx), (ZIy), (ZIc))) { perror("fsm_addedge_literal"); @@ -1120,7 +1120,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) { /* BEGINNING OF ACTION: add-edge-epsilon */ { -#line 383 "src/libfsm/parser.act" +#line 382 "src/libfsm/parser.act" if (!fsm_addedge_epsilon(fsm, (ZIx), (ZIy))) { perror("fsm_addedge_epsilon"); @@ -1138,7 +1138,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) { /* BEGINNING OF ACTION: err-expected-trans */ { -#line 396 "src/libfsm/parser.act" +#line 394 "src/libfsm/parser.act" err_expected(lex_state, "transition"); @@ -1162,7 +1162,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* BEGINNING OF ACTION: add-state */ { -#line 284 "src/libfsm/parser.act" +#line 282 "src/libfsm/parser.act" struct act_statelist *p; const unsigned hash = hash_of_id((*ZIa)); @@ -1220,7 +1220,7 @@ p_78(fsm fsm, lex_state lex_state, act_state act_state, string *ZIa) /* END OF ACTION: add-state */ /* BEGINNING OF ACTION: free */ { -#line 350 "src/libfsm/parser.act" +#line 349 "src/libfsm/parser.act" free((*ZIa)); @@ -1269,7 +1269,7 @@ ZL2_xend_C_Cend_Hids:; } /* END OF INLINE: xend::end-ids */ } - /*UNREACHED*/ + 
/* UNREACHED */ case (ERROR_TERMINAL): RESTORE_LEXER; goto ZL1; @@ -1298,7 +1298,7 @@ p_xend_C_Cend_Hid(fsm fsm, lex_state lex_state, act_state act_state, state ZIs) case (TOK_ENDID): /* BEGINNING OF EXTRACT: ENDID */ { -#line 277 "src/libfsm/parser.act" +#line 255 "src/libfsm/parser.act" unsigned long u; char *e; @@ -1333,7 +1333,7 @@ p_xend_C_Cend_Hid(fsm fsm, lex_state lex_state, act_state act_state, state ZIs) ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-end-id */ { -#line 344 "src/libfsm/parser.act" +#line 343 "src/libfsm/parser.act" if (!fsm_endid_set(fsm, (ZIs), (ZIid))) { goto ZL1; @@ -1351,7 +1351,7 @@ ZL1:; /* BEGINNING OF TRAILER */ -#line 479 "src/libfsm/parser.act" +#line 415 "src/libfsm/parser.act" struct fsm *fsm_parse(FILE *f, const struct fsm_alloc *alloc) { diff --git a/src/libfsm/parser.h b/src/libfsm/parser.h index edeebb112..32f562c66 100644 --- a/src/libfsm/parser.h +++ b/src/libfsm/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 163 "src/libfsm/parser.act" +#line 153 "src/libfsm/parser.act" typedef struct lex_state * lex_state; @@ -27,7 +27,7 @@ extern void p_fsm(fsm, lex_state, act_state); /* BEGINNING OF TRAILER */ -#line 480 "src/libfsm/parser.act" +#line 479 "src/libfsm/parser.act" #line 33 "src/libfsm/parser.h" diff --git a/src/libfsm/print.c b/src/libfsm/print.c index 0615c0d1f..f5fc3b436 100644 --- a/src/libfsm/print.c +++ b/src/libfsm/print.c @@ -111,6 +111,7 @@ int print_hook_reject(FILE *f, const struct fsm_options *opt, const struct fsm_hooks *hooks, + const struct fsm_state_metadata *state_metadata, int (*default_reject)(FILE *f, const struct fsm_options *opt, void *lang_opaque, void *hook_opaque), void *lang_opaque) @@ -124,7 +125,7 @@ print_hook_reject(FILE *f, } if (hooks->reject != NULL) { - return hooks->reject(f, opt, + return hooks->reject(f, opt, state_metadata, lang_opaque, hooks->hook_opaque); } else if (default_reject != NULL) { return default_reject(f, opt, diff --git a/src/libfsm/print.h b/src/libfsm/print.h 
index e3e485162..6540798ca 100644 --- a/src/libfsm/print.h +++ b/src/libfsm/print.h @@ -44,6 +44,7 @@ int print_hook_reject(FILE *f, const struct fsm_options *opt, const struct fsm_hooks *hooks, + const struct fsm_state_metadata *state_metadata, int (*default_reject)(FILE *f, const struct fsm_options *opt, void *lang_opaque, void *hook_opaque), void *lang_opaque); diff --git a/src/libfsm/print/awk.c b/src/libfsm/print/awk.c index bd78e8ef7..4d952cbef 100644 --- a/src/libfsm/print/awk.c +++ b/src/libfsm/print/awk.c @@ -154,7 +154,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC:; struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/c.c b/src/libfsm/print/c.c index 3a6778545..ef6ffcfd0 100644 --- a/src/libfsm/print/c.c +++ b/src/libfsm/print/c.c @@ -210,7 +210,7 @@ print_groups(FILE *f, const struct fsm_options *opt, } static int -print_case(FILE *f, const struct ir *ir, +print_case(FILE *f, const struct ir *ir, fsm_state_t state_id, const struct fsm_options *opt, const struct fsm_hooks *hooks, const char *cp, @@ -222,6 +222,12 @@ print_case(FILE *f, const struct ir *ir, assert(f != NULL); assert(cs != NULL); + assert(state_id < ir->n); + const struct fsm_state_metadata state_metadata = { + .end_ids = ir->states[state_id].endids.ids, + .end_id_count = ir->states[state_id].endids.count, + }; + if (cs->eager_outputs != NULL && opt->fragment) { /* If .fragment is set and the state has eager outputs, then emit a call to a * macro (the caller is expected to define). This is a temporary interface. 
*/ @@ -233,7 +239,7 @@ print_case(FILE *f, const struct ir *ir, switch (cs->strategy) { case IR_NONE: fprintf(f, "\t\t\t"); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, &state_metadata, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -262,7 +268,7 @@ print_case(FILE *f, const struct ir *ir, print_groups(f, opt, ir_indexof(ir, cs), cs->u.partial.groups, cs->u.partial.n); fprintf(f, "\t\t\tdefault: "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, &state_metadata, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -293,7 +299,7 @@ print_case(FILE *f, const struct ir *ir, print_ranges(f, opt, cs->u.error.error.ranges, cs->u.error.error.n); fprintf(f, " "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, &state_metadata, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -411,7 +417,7 @@ print_endstates(FILE *f, /* unexpected EOT */ fprintf(f, "\tdefault: "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, NULL)) { return -1; } fprintf(f, "\n"); @@ -448,7 +454,7 @@ fsm_print_cfrag(FILE *f, const struct ir *ir, } fprintf(f, "\n"); - if (-1 == print_case(f, ir, opt, hooks, cp, &ir->states[i])) { + if (-1 == print_case(f, ir, i, opt, hooks, cp, &ir->states[i])) { return -1; } @@ -513,6 +519,12 @@ fsm_print_c_body(FILE *f, const struct ir *ir, break; } + if (hooks->advance != NULL) { + if (-1 == hooks->advance(f, opt, cp, hooks->hook_opaque)) { + return -1; + } + } + if (-1 == fsm_print_cfrag(f, ir, opt, hooks, cp)) { return -1; } diff --git a/src/libfsm/print/dot.c b/src/libfsm/print/dot.c index c1ac9b875..69cce10aa 100644 --- a/src/libfsm/print/dot.c +++ b/src/libfsm/print/dot.c @@ -226,7 +226,7 @@ print_dotfrag(FILE *f, if (!opt->anonymous_states) { 
fprintf(f, "\t%sS%-2u [ ", prefix, s); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, &s)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, &s)) { return -1; } diff --git a/src/libfsm/print/fsm.c b/src/libfsm/print/fsm.c index 06c7e3403..027531514 100644 --- a/src/libfsm/print/fsm.c +++ b/src/libfsm/print/fsm.c @@ -184,7 +184,7 @@ print_state(FILE *f, const struct fsm_options *opt, const struct fsm_hooks *hook assert(opt != NULL); if (!fsm_isend(fsm, s)) { - if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL, NULL)) { return -1; } } diff --git a/src/libfsm/print/go.c b/src/libfsm/print/go.c index 3575c5b75..1f6b7a0dc 100644 --- a/src/libfsm/print/go.c +++ b/src/libfsm/print/go.c @@ -185,7 +185,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC: assert(op->ret >= retlist->a); diff --git a/src/libfsm/print/ir.c b/src/libfsm/print/ir.c index a18dadbbc..d8e29b2e7 100644 --- a/src/libfsm/print/ir.c +++ b/src/libfsm/print/ir.c @@ -577,7 +577,7 @@ make_ir(const struct fsm *fsm, const struct fsm_options *opt) if (outputs == NULL) { goto error; } - fsm_eager_output_get(fsm, i, outputs->ids); + fsm_eager_output_get(fsm, i, eager_output_count, outputs->ids); outputs->count = eager_output_count; ir->states[i].eager_outputs = outputs; diff --git a/src/libfsm/print/ir.h b/src/libfsm/print/ir.h index 7678d3f35..b4b93a9eb 100644 --- a/src/libfsm/print/ir.h +++ b/src/libfsm/print/ir.h @@ -96,11 +96,7 @@ struct ir_state { } error; struct { - /* Note: This is allocated separately, to avoid - * making the union significantly larger. 
*/ - struct ir_state_table { - unsigned to[FSM_SIGMA_COUNT]; - } *table; + int not_yet_implemented; } table; } u; }; diff --git a/src/libfsm/print/llvm.c b/src/libfsm/print/llvm.c index 14b116555..98e76d46f 100644 --- a/src/libfsm/print/llvm.c +++ b/src/libfsm/print/llvm.c @@ -713,7 +713,7 @@ fsm_print_llvm(FILE *f, fprintf(f, "\n"); } fprintf(f, "\t "); - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, NULL)) { return -1; } fprintf(f, "\n"); diff --git a/src/libfsm/print/rust.c b/src/libfsm/print/rust.c index 682bc051f..c1fcf5bb4 100644 --- a/src/libfsm/print/rust.c +++ b/src/libfsm/print/rust.c @@ -174,7 +174,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC: assert(op->ret >= retlist->a); diff --git a/src/libfsm/print/sh.c b/src/libfsm/print/sh.c index 3c11c2f1f..5322e4ffc 100644 --- a/src/libfsm/print/sh.c +++ b/src/libfsm/print/sh.c @@ -202,7 +202,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC:; const struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/vmasm.c b/src/libfsm/print/vmasm.c index e29eda24d..bf5551420 100644 --- a/src/libfsm/print/vmasm.c +++ b/src/libfsm/print/vmasm.c @@ -44,7 +44,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, NULL, NULL)) { return -1; } break; diff --git a/src/libfsm/print/vmc.c b/src/libfsm/print/vmc.c index e6d0eaece..b427a9cb8 100644 --- a/src/libfsm/print/vmc.c +++ 
b/src/libfsm/print/vmc.c @@ -175,7 +175,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC:; struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/vmdot.c b/src/libfsm/print/vmdot.c index 512cc893b..57344cddb 100644 --- a/src/libfsm/print/vmdot.c +++ b/src/libfsm/print/vmdot.c @@ -126,7 +126,7 @@ print_end(FILE *f, { switch (end_bits) { case VM_END_FAIL: - return print_hook_reject(f, opt, hooks, default_reject, NULL); + return print_hook_reject(f, opt, hooks, NULL, default_reject, NULL); case VM_END_SUCC:; struct fsm_state_metadata state_metadata = { diff --git a/src/libfsm/print/vmops.c b/src/libfsm/print/vmops.c index 59de464f1..74fc6d350 100644 --- a/src/libfsm/print/vmops.c +++ b/src/libfsm/print/vmops.c @@ -146,7 +146,7 @@ print_end(FILE *f, const struct dfavm_op_ir *op, switch (end_bits) { case VM_END_FAIL: - if (-1 == print_hook_reject(f, opt, hooks, default_reject, NULL)) { + if (-1 == print_hook_reject(f, opt, hooks, NULL, default_reject, NULL)) { return -1; } break; diff --git a/src/libfsm/state.c b/src/libfsm/state.c index 8f1146038..786bc3a25 100644 --- a/src/libfsm/state.c +++ b/src/libfsm/state.c @@ -90,7 +90,6 @@ fsm_addstate_bulk(struct fsm *fsm, size_t n) new->visited = 0; new->epsilons = NULL; new->edges = NULL; - new->has_eager_outputs = 0; } diff --git a/src/libfsm/union.c b/src/libfsm/union.c index 9062e1dbd..4b07ca6cf 100644 --- a/src/libfsm/union.c +++ b/src/libfsm/union.c @@ -41,17 +41,19 @@ struct analysis_info { fsm_state_t start; /* start state */ /* The states with a /./ self edge representing the unanchored - * start and end, or NO_STATE. There can be at most one of each. */ + * start and end, or LINKAGE_NO_STATE. There can be at most one + * of each. Copied from linkage_info. 
*/ fsm_state_t unanchored_start_loop; fsm_state_t unanchored_end_loop; - /* The end state following the unanchored end loop. */ + /* The end state following the unanchored end loop. + * Copied from linkage_info.*/ fsm_state_t unanchored_end_loop_end; - /* States that link to paths only reachable from the beginning of input. */ + /* States that link to paths only reachable from the beginning of input. + * Copied from linkage_info. */ struct state_set *anchored_starts; - - /* States leading to an anchored end. */ + /* States leading to an anchored end. Copied from linkage_info. */ struct state_set *anchored_ends; /* States with an outgoing labeled edge to the unanchored end loop. Input @@ -74,6 +76,11 @@ struct analysis_info { /* These states need an epsilon edge added to the eager_matched_state. */ struct state_set *needs_indirect_epsilon_edge_to_eager_match_state; + + /* States which are reachable from any state besides the start + * state. This can be necessary to correctly identify the + * unanchored start loop. 
*/ + struct state_set *reachable_from_nonstart_state; }; struct fsm * @@ -208,25 +215,6 @@ fsm_union_array(size_t fsm_count, return res; } -static bool -has_dot_self_edge(const struct fsm *nfa, fsm_state_t s_i) -{ - const struct fsm_state *s = &nfa->states[s_i]; - - struct edge_group_iter ei; - edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &ei); - struct edge_group_iter_info info; - while (edge_set_group_iter_next(&ei, &info)) { - if (info.to != s_i) { continue; } - for (size_t i = 0; i < 256/64; i++) { - if (info.symbols[i] != (uint64_t)-1) { continue; } - } - return true; - } - - return false; -} - #if LOG_ANALYZE_GROUP_NFA_RESULTS static void dump_state_set(FILE *f, const char *name, const struct state_set *set) @@ -259,101 +247,40 @@ dump_edge_set(FILE *f, const char *name, fsm_state_t from, const struct edge_set } #endif -/* For each state in the epsilon closure, if there's a labeled edge - * to an end state, check if the label set is only [\n] and there's - * also an epsilon edge to the same end state. - * If so, this represents an anchored end in the NFA. 
*/ static bool -has_epsilon_and_newline_edges_to_same_end(const struct fsm *nfa, struct state_set *eclosure, - fsm_state_t s_i, fsm_state_t *dst_end) -{ - struct state_iter si; - state_set_reset(eclosure, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - assert(ns_i < nfa->statecount); - const struct fsm_state *ns = &nfa->states[ns_i]; - - if (state_set_empty(ns->epsilons)) { continue; } - if (edge_set_empty(ns->edges)) { continue; } - - struct edge_group_iter iter; - struct edge_group_iter_info info; - edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &iter); - while (edge_set_group_iter_next(&iter, &info)) { - /* Look for an edge set with only '\n' */ - if ((info.symbols[0] != (1ULL << '\n')) - || info.symbols[1] || info.symbols[2] || info.symbols[3]) { - continue; - } - - /* If it's an end, look for an epsilon leeding to the same destination */ - if (fsm_isend(nfa, info.to)) { - struct state_iter inner_si; - fsm_state_t os_i; - - assert(s_i < nfa->statecount); - const struct fsm_state *s = &nfa->states[s_i]; - - state_set_reset(s->epsilons, &inner_si); - while (state_set_next(&inner_si, &os_i)) { - if (os_i == info.to) { - *dst_end = info.to; - return true; - } - } - } - } - } - - return false; -} - -static bool -has_labeled_edge_to_eclosure_with_unanchored_end_loop(const struct fsm *nfa, - struct state_set **eclosures, - fsm_state_t s_i, fsm_state_t unanchored_end_loop, +state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(const struct fsm *nfa, + fsm_state_t s_i, struct state_set **eclosures, + fsm_state_t unanchored_end_loop, fsm_state_t *indirect_dst) { if (unanchored_end_loop == NO_STATE) { return false; } assert(unanchored_end_loop < nfa->statecount); assert(s_i < nfa->statecount); - const struct state_set *s_eclosure = eclosures[s_i]; - /* For every state in s_i's epsilon closure, check if it has - * a labeled edge to a state with the unanchored_end_loop - * in its epsilon closure. 
*/ - struct state_iter si; - state_set_reset(s_eclosure, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - /* The unanchored_end_loop's self-edge doesn't count here. */ - if (ns_i == unanchored_end_loop) { continue; } - - /* FIXME: this should only apply to the original state, not its epsilon closure...right? */ - if (ns_i != s_i) { continue; } + /* The unanchored_end_loop doesn't count, here. */ + if (s_i == unanchored_end_loop) { return false; } - assert(ns_i < nfa->statecount); - const struct fsm_state *ns = &nfa->states[ns_i]; - struct edge_group_iter iter; - struct edge_group_iter_info info; - edge_set_group_iter_reset(ns->edges, EDGE_GROUP_ITER_ALL, &iter); - while (edge_set_group_iter_next(&iter, &info)) { - assert(info.to < nfa->statecount); - const struct state_set *to_eclosure = eclosures[info.to]; - - struct state_iter dst_si; - state_set_reset(to_eclosure, &dst_si); - fsm_state_t dst_s_i; - while (state_set_next(&dst_si, &dst_s_i)) { - if (dst_s_i == unanchored_end_loop) { - if (info.to != unanchored_end_loop) { - *indirect_dst = info.to; - } - - return true; + /* Check whether the state has a labeled edge to a state with the + * unanchored_end_loop in its epsilon closure. 
*/ + const struct fsm_state *s = &nfa->states[s_i]; + struct edge_group_iter iter; + struct edge_group_iter_info info; + edge_set_group_iter_reset(s->edges, EDGE_GROUP_ITER_ALL, &iter); + while (edge_set_group_iter_next(&iter, &info)) { + assert(info.to < nfa->statecount); + const struct state_set *to_eclosure = eclosures[info.to]; + + struct state_iter dst_si; + state_set_reset(to_eclosure, &dst_si); + fsm_state_t dst_s_i; + while (state_set_next(&dst_si, &dst_s_i)) { + if (dst_s_i == unanchored_end_loop) { + if (info.to != unanchored_end_loop) { + *indirect_dst = info.to; } + + return true; } } } @@ -393,11 +320,7 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) } } - memset(ainfo, 0x00, sizeof(*ainfo)); ainfo->start = NO_STATE; - ainfo->unanchored_start_loop = NO_STATE; - ainfo->unanchored_end_loop = NO_STATE; - ainfo->unanchored_end_loop_end = NO_STATE; ainfo->eager_match_state = NO_STATE; if (!fsm_getstart(nfa, &ainfo->start)) { @@ -412,53 +335,35 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) return false; } - /* First pass: Iterate over the start state's epsilon edges, - * attempting to identify the unanchored start loop and anchored - * start states (if present). - * - * Note: This uses the start state's epsilon set rather than its - * epsilon closure because (by construction) the unanchored - * start loop and anchored start states will both be directly - * connected to the start state. Using the epsilon closure can - * mis-identify the unanchored *end* loop as the start loop, if - * there is a path with only epsilon edges between them. */ - struct state_iter si; - state_set_reset(nfa->states[ainfo->start].epsilons, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - if (ns_i == ainfo->start) { continue; } - - /* If there's a state in the start state's epsilon set that - * has a dot self-edge, it's the unanchored start loop. 
*/ - if (has_dot_self_edge(nfa, ns_i)) { - if (LOG_ANALYZE_GROUP_NFA) { - fprintf(stderr, "%s: unanchored_start_loop found on state %d\n", __func__, ns_i); - } + /* Mark any states that are reachable from any state besides the + * start state -- this means they cannot be the unanchored start + * loop, in cases where the pass below would otherwise detect + * more than one. */ + for (fsm_state_t s_i = 0; s_i < state_count; s_i++) { + if (s_i == ainfo->start) { continue; } - /* TODO: There is only one unanchored start loop, but in obscure cases it may - * be difficult to distinguish between the USL and the unanchored end loop or - * other intermediate .* loops. The real USL will strictly appear before any - * other such loops in the graph. - * - * For now, assert that there is only one, because it's safer to have this - * loudly fail at compile time than produce an incorrect graph. Fuzzing has - * produced some inputs that make this fail, but currently they seem to - * depend on having a '\0' character embedded in the middle, which would - * normally be rejected by this point. */ - assert(ainfo->unanchored_start_loop == NO_STATE - || ainfo->unanchored_start_loop == ns_i); - ainfo->unanchored_start_loop = ns_i; - continue; - } else { - /* Otherwise, a state without a dot self-edge is the anchored start. 
*/ - if (LOG_ANALYZE_GROUP_NFA) { - fprintf(stderr, "%s: anchored_start found on state %d\n", __func__, ns_i); + struct state_iter si; + state_set_reset(nfa->states[s_i].epsilons, &si); + fsm_state_t eps_i; + while (state_set_next(&si, &eps_i)) { + /* Ignore self edges */ + if (eps_i == s_i) { continue; } + + if (!state_set_add(&ainfo->reachable_from_nonstart_state, nfa->alloc, eps_i)) { + return false; } + } - if (!state_set_add(&ainfo->anchored_starts, nfa->alloc, ns_i)) { - goto alloc_fail; + struct edge_group_iter egi; + struct edge_group_iter_info info; + edge_set_group_iter_reset(nfa->states[s_i].edges, EDGE_GROUP_ITER_ALL, &egi); + while (edge_set_group_iter_next(&egi, &info)) { + /* Ignore self edges */ + if (info.to == s_i) { continue; } + + if (!state_set_add(&ainfo->reachable_from_nonstart_state, nfa->alloc, info.to)) { + return false; } - continue; } } @@ -519,37 +424,10 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) * trivially match, but otherwise it would never match. */ ainfo->nullable = start_state_epsilon_closure_matches_empty_string(nfa, eclosures[ainfo->start]); - /* If there's a state with a dot self-edge and an epsilon edge to an end state, it's - * the unanchored end loop. There should only be one. */ - for (size_t s_i = 0; s_i < state_count; s_i++) { - const struct fsm_state *s = &nfa->states[s_i]; - if (has_dot_self_edge(nfa, s_i)) { - struct state_iter si; - state_set_reset(s->epsilons, &si); - fsm_state_t ns_i; - while (state_set_next(&si, &ns_i)) { - if (fsm_isend(nfa, ns_i)) { - assert(ainfo->unanchored_end_loop == NO_STATE); - ainfo->unanchored_end_loop = s_i; - ainfo->unanchored_end_loop_end = ns_i; - break; - } - } - if (ainfo->unanchored_end_loop != NO_STATE) { break; } - } - } - /* Collect states that lead to an anchored end or eager match. 
*/ for (size_t s_i = 0; s_i < state_count; s_i++) { - fsm_state_t dst_end = NO_STATE; - if (has_epsilon_and_newline_edges_to_same_end(nfa, eclosures[s_i], s_i, &dst_end)) { - if (!state_set_add(&ainfo->anchored_ends, nfa->alloc, dst_end)) { - goto alloc_fail; - } - } - fsm_state_t indirect_dst = NO_STATE; - if (has_labeled_edge_to_eclosure_with_unanchored_end_loop(nfa, eclosures, s_i, ainfo->unanchored_end_loop, &indirect_dst)) { + if (state_has_labeled_edge_to_eclosure_with_unanchored_end_loop(nfa, s_i, eclosures, ainfo->unanchored_end_loop, &indirect_dst)) { if (!state_set_add(&ainfo->eager_matches, nfa->alloc, s_i)) { goto alloc_fail; } @@ -568,13 +446,16 @@ analyze_group_nfa(const struct fsm *nfa, struct analysis_info *ainfo) ainfo->start, ainfo->unanchored_start_loop, ainfo->unanchored_end_loop, ainfo->unanchored_end_loop_end); dump_state_set(stderr, "anchored_ends", ainfo->anchored_ends); dump_state_set(stderr, "eager_matches", ainfo->eager_matches); - dump_edge_set(stderr, "anchored_firsts", ainfo->anchored_start, ainfo->anchored_firsts); dump_edge_set(stderr, "repeatable_firsts", ainfo->unanchored_start_loop, ainfo->repeatable_firsts); } #endif closure_free(nfa, eclosures, state_count); + /* The unanchored start and end loop cannot be the same state. */ + assert(ainfo->unanchored_start_loop == NO_STATE + || ainfo->unanchored_start_loop != ainfo->unanchored_end_loop); + return true; alloc_fail: @@ -651,7 +532,7 @@ modify_group_nfa(struct fsm *nfa, size_t id, struct analysis_info *ainfo, size_t /* Set eager match ID on new eager_match_state. 
*/ const fsm_output_id_t oid = (fsm_output_id_t)(id + id_base); - if (!fsm_seteageroutput(nfa, ainfo->eager_match_state, oid)) { + if (!fsm_eager_output_set(nfa, ainfo->eager_match_state, oid)) { return false; } if (log) { @@ -773,6 +654,7 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base) state_set_rebase(&ainfo->anchored_ends, base); state_set_rebase(&ainfo->anchored_starts, base); state_set_rebase(&ainfo->eager_matches, base); + state_set_rebase(&ainfo->reachable_from_nonstart_state, base); edge_set_rebase(&ainfo->anchored_firsts, base); edge_set_rebase(&ainfo->repeatable_firsts, base); @@ -782,10 +664,11 @@ rebase_analysis_info(struct analysis_info *ainfo, fsm_state_t base) static void free_analysis(const struct fsm_alloc *alloc, struct analysis_info *ainfo) { - state_set_free(ainfo->anchored_ends); state_set_free(ainfo->anchored_starts); + state_set_free(ainfo->anchored_ends); state_set_free(ainfo->eager_matches); state_set_free(ainfo->needs_indirect_epsilon_edge_to_eager_match_state); + state_set_free(ainfo->reachable_from_nonstart_state); edge_set_free(alloc, ainfo->anchored_firsts); edge_set_free(alloc, ainfo->repeatable_firsts); } @@ -810,6 +693,16 @@ fsm_union_repeated_pattern_group(size_t nfa_count, errno = EINVAL; return NULL; } + + /* Any NFAs passed to this function must be built with + * an re_comp flag of RE_SAVE_LINKAGE_INFO, because some + * of the info saved during construction informs + * linking. */ + if (nfas[i]->linkage_info == NULL) { + errno = EINVAL; + return NULL; + } + const size_t count = fsm_countstates(nfas[i]); est_total_states += count; } @@ -817,8 +710,26 @@ fsm_union_repeated_pattern_group(size_t nfa_count, for (size_t i = 0; i < nfa_count; i++) { struct fsm *fsm = nfas[i]; + struct analysis_info *ainfo = &ainfos[i]; + + /* Copy these fields over, because fsm->linkage_info will be + * freed during the call to fsm_merge below. 
*/ + { + struct linkage_info *linkage_info = fsm->linkage_info; + + ainfo->unanchored_start_loop = linkage_info->unanchored_start_loop; + ainfo->unanchored_end_loop = linkage_info->unanchored_end_loop; + ainfo->unanchored_end_loop_end = linkage_info->unanchored_end_loop_end; + + /* Transfer ownership of these. */ + ainfo->anchored_starts = linkage_info->anchored_starts; + linkage_info->anchored_starts = NULL; + ainfo->anchored_ends = linkage_info->anchored_ends; + linkage_info->anchored_ends = NULL; + } + /* Identify various states in the NFA that will be relevant to combining. */ - if (!analyze_group_nfa(fsm, &ainfos[i])) { + if (!analyze_group_nfa(fsm, ainfo)) { goto fail; } @@ -838,9 +749,11 @@ fsm_union_repeated_pattern_group(size_t nfa_count, fsm_state_t global_start; if (!fsm_addstate(res, &global_start)) { goto fail; } - /* States linking to the starts of unanchored and anchored subgraphs, respectively. - * Matching other group NFAs loops back to the global_unanchored_start_loop, but - * patterns anchored at the ^start are only reachable via global_anchored_start. */ + /* States linking to the starts of unanchored and anchored + * subgraphs, respectively. Matching group NFAs with unanchored + * ends will loop back to the global_unanchored_start_loop, but + * patterns anchored at the start are only reachable via + * global_anchored_start. */ fsm_state_t global_unanchored_start_loop, global_anchored_start; if (!fsm_addstate(res, &global_unanchored_start_loop)) { goto fail; } if (!fsm_addstate(res, &global_anchored_start)) { goto fail; } @@ -850,27 +763,6 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addstate(res, &global_end)) { goto fail; } if (!fsm_addstate(res, &global_unanchored_end_loop)) { goto fail; } - /* do this later, combining NFAs may rebase the state IDs */ -#if 0 - /* link the start to the global unanchored start loop and anchored start. 
*/ - if (log) { - fprintf(stderr, "link_before: global_start %d -> global_unanchored_start_loop %d and global_anchored_start %d\n", - global_start, global_unanchored_start_loop, global_anchored_start); - } - if (!fsm_addedge_epsilon(res, global_start, global_unanchored_start_loop)) { goto fail; } - if (!fsm_addedge_epsilon(res, global_start, global_anchored_start)) { goto fail; } - - /* Link the global unanchored start loop to itself. */ - if (!fsm_addedge_any(res, global_unanchored_start_loop, global_unanchored_start_loop)) { goto fail; } - - /* Link the global unanchored end loop and global end. */ - if (log) { - fprintf(stderr, "link_before: global_unanchored_end_loop %d -> global_end %d (and -> self)\n", global_unanchored_end_loop, global_end); - } - if (!fsm_addedge_any(res, global_unanchored_end_loop, global_unanchored_end_loop)) { goto fail; } - if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } -#endif - if (bases != NULL) { memset(bases, 0x00, nfa_count * sizeof(bases[0])); } @@ -889,7 +781,7 @@ fsm_union_repeated_pattern_group(size_t nfa_count, } assert(ainfo->start < state_count); - /* Call fsm_merge; we really don't care which is which. */ + /* Call fsm_merge; the argument order shouldn't matter. */ struct fsm_combine_info combine_info; struct fsm *merged = fsm_merge(res, fsm, &combine_info); if (merged == NULL) { goto fail; } @@ -1021,7 +913,7 @@ fsm_union_repeated_pattern_group(size_t nfa_count, res = merged; } - /* link the start to the global unanchored start loop and anchored start. */ + /* Link the global start to the global unanchored start loop and anchored start states. 
*/ if (log) { fprintf(stderr, "linking: global_start %d -> global_unanchored_start_loop %d and global_anchored_start %d\n", global_start, global_unanchored_start_loop, global_anchored_start); @@ -1029,7 +921,8 @@ fsm_union_repeated_pattern_group(size_t nfa_count, if (!fsm_addedge_epsilon(res, global_start, global_unanchored_start_loop)) { goto fail; } if (!fsm_addedge_epsilon(res, global_start, global_anchored_start)) { goto fail; } - /* Link the global unanchored start loop to itself. */ + /* Link the global unanchored start loop to itself, so it can + * consume and ignore input preceding each matching group NFA. */ if (!fsm_addedge_any(res, global_unanchored_start_loop, global_unanchored_start_loop)) { goto fail; } /* Link the global unanchored end loop and global end. */ @@ -1050,9 +943,12 @@ fsm_union_repeated_pattern_group(size_t nfa_count, fprintf(stderr, "%s: setting global_start %d and end %d\n", __func__, global_start, global_end); } if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_unanchored_start_loop)) { goto fail; } + + /* Link the global unanchored end loop to the global end, so + * reaching the end of input there is considered a match. */ if (!fsm_addedge_epsilon(res, global_unanchored_end_loop, global_end)) { goto fail; } - /* This needs to be set after merging, because that clears the start state. */ + /* These need to be set after merging, because that clears the start state. 
*/ fsm_setstart(res, global_start); fsm_setend(res, global_end, 1); diff --git a/src/libre/Makefile b/src/libre/Makefile index 67617ab92..36601004a 100644 --- a/src/libre/Makefile +++ b/src/libre/Makefile @@ -10,6 +10,7 @@ SRC += src/libre/ast_new_from_fsm.c SRC += src/libre/ast_rewrite.c SRC += src/libre/ac.c SRC += src/libre/print.c +SRC += src/libre/re_interpolate_groups.c SRC += src/libre/re_strings.c # generated diff --git a/src/libre/ast_compile.c b/src/libre/ast_compile.c index 0ea0b08d8..5d31ef0a2 100644 --- a/src/libre/ast_compile.c +++ b/src/libre/ast_compile.c @@ -20,6 +20,8 @@ #include +#include + #include "class.h" #include "ast.h" #include "ast_compile.h" @@ -258,6 +260,10 @@ intern_start_any_loop(struct comp_env *env) env->start_any_loop = loop; env->has_start_any_loop = 1; + if (env->fsm->linkage_info) { + env->fsm->linkage_info->unanchored_start_loop = loop; + } + return 1; } @@ -290,6 +296,11 @@ intern_end_any_loop(struct comp_env *env) env->end_any_loop = loop; env->has_end_any_loop = 1; + if (env->fsm->linkage_info != NULL) { + env->fsm->linkage_info->unanchored_end_loop = loop; + env->fsm->linkage_info->unanchored_end_loop_end = env->end; + } + return 1; } @@ -327,6 +338,14 @@ intern_end_nl(struct comp_env *env) env->end_nl = end_nl; env->has_end_nl = 1; + + if (env->fsm->linkage_info != NULL) { + if (!state_set_add(&env->fsm->linkage_info->anchored_ends, + env->fsm->alloc, env->end)) { + return 0; + } + } + return 1; } @@ -718,7 +737,7 @@ comp_iter(struct comp_env *env, } #if LOG_LINKAGE - fprintf(stderr, " ---> x: %d, y: %d\n", x, y); + fprintf(stderr, " ---> x: %d, y: %d, type: %s\n", x, y, ast_node_type_name(n->type)); #endif switch (n->type) { @@ -871,6 +890,20 @@ comp_iter(struct comp_env *env, case AST_EXPR_ANCHOR: EPSILON(x, y); + + if (env->fsm->linkage_info != NULL + && x == env->start + && n->u.anchor.type == AST_ANCHOR_START) { + /* This state is directly linked from the global start. 
*/ +#if LOG_LINKAGE + fprintf(stderr, "%s: adding %d to anchored_starts due to start anchor\n", + __func__, y); +#endif + if (!state_set_add(&env->fsm->linkage_info->anchored_starts, + env->fsm->alloc, y)) { + return 0; + } + } break; case AST_EXPR_SUBTRACT: { @@ -973,6 +1006,17 @@ ast_compile(const struct ast *ast, return NULL; } + if (re_flags & RE_SAVE_LINKAGE_INFO) { + struct linkage_info *li = f_malloc(alloc, sizeof(*fsm->linkage_info)); + if (li == NULL) { goto error; } + li->unanchored_start_loop = LINKAGE_NO_STATE; + li->unanchored_end_loop = LINKAGE_NO_STATE; + li->unanchored_end_loop_end = LINKAGE_NO_STATE; + li->anchored_starts = NULL; + li->anchored_ends = NULL; + fsm->linkage_info = li; + } + if (!fsm_addstate(fsm, &x)) { goto error; } diff --git a/src/libre/class.h b/src/libre/class.h index 919571982..79460a9da 100644 --- a/src/libre/class.h +++ b/src/libre/class.h @@ -29,6 +29,7 @@ extern const struct class utf8_Bamum; extern const struct class utf8_Bassa_Vah; extern const struct class utf8_Batak; extern const struct class utf8_Bengali; +extern const struct class utf8_Beria_Erfe; extern const struct class utf8_Bhaiksuki; extern const struct class utf8_Bopomofo; extern const struct class utf8_Brahmi; @@ -153,6 +154,7 @@ extern const struct class utf8_Saurashtra; extern const struct class utf8_Sharada; extern const struct class utf8_Shavian; extern const struct class utf8_Siddham; +extern const struct class utf8_Sidetic; extern const struct class utf8_SignWriting; extern const struct class utf8_Sinhala; extern const struct class utf8_Sogdian; @@ -167,6 +169,7 @@ extern const struct class utf8_Tagbanwa; extern const struct class utf8_Tai_Le; extern const struct class utf8_Tai_Tham; extern const struct class utf8_Tai_Viet; +extern const struct class utf8_Tai_Yo; extern const struct class utf8_Takri; extern const struct class utf8_Tamil; extern const struct class utf8_Tangsa; @@ -178,6 +181,7 @@ extern const struct class utf8_Tibetan; extern const struct 
class utf8_Tifinagh; extern const struct class utf8_Tirhuta; extern const struct class utf8_Todhri; +extern const struct class utf8_Tolong_Siki; extern const struct class utf8_Toto; extern const struct class utf8_Tulu_Tigalari; extern const struct class utf8_Ugaritic; diff --git a/src/libre/class/utf8_Arabic.c b/src/libre/class/utf8_Arabic.c index 904ab392e..592e06d7d 100644 --- a/src/libre/class/utf8_Arabic.c +++ b/src/libre/class/utf8_Arabic.c @@ -13,21 +13,18 @@ static const struct range ranges[] = { { 0x0671UL, 0x06DCUL }, { 0x06DEUL, 0x06FFUL }, { 0x0750UL, 0x077FUL }, - { 0x0870UL, 0x088EUL }, - { 0x0890UL, 0x0891UL }, + { 0x0870UL, 0x0891UL }, { 0x0897UL, 0x08E1UL }, { 0x08E3UL, 0x08FFUL }, - { 0xFB50UL, 0xFBC2UL }, - { 0xFBD3UL, 0xFD3DUL }, - { 0xFD40UL, 0xFD8FUL }, - { 0xFD92UL, 0xFDC7UL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFB50UL, 0xFD3DUL }, + { 0xFD40UL, 0xFDCFUL }, { 0xFDF0UL, 0xFDFFUL }, { 0xFE70UL, 0xFE74UL }, { 0xFE76UL, 0xFEFCUL }, { 0x10E60UL, 0x10E7EUL }, - { 0x10EC2UL, 0x10EC4UL }, - { 0x10EFCUL, 0x10EFFUL }, + { 0x10EC2UL, 0x10EC7UL }, + { 0x10ED0UL, 0x10ED8UL }, + { 0x10EFAUL, 0x10EFFUL }, { 0x1EE00UL, 0x1EE03UL }, { 0x1EE05UL, 0x1EE1FUL }, { 0x1EE21UL, 0x1EE22UL }, diff --git a/src/libre/class/utf8_Beria_Erfe.c b/src/libre/class/utf8_Beria_Erfe.c new file mode 100644 index 000000000..3bf7b4db7 --- /dev/null +++ b/src/libre/class/utf8_Beria_Erfe.c @@ -0,0 +1,14 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x16EA0UL, 0x16EB8UL }, + { 0x16EBBUL, 0x16ED3UL } +}; + +const struct class utf8_Beria_Erfe = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_Common.c b/src/libre/class/utf8_Common.c index 44e6f3ff2..26c3f54e8 100644 --- a/src/libre/class/utf8_Common.c +++ b/src/libre/class/utf8_Common.c @@ -43,7 +43,7 @@ static const struct range ranges[] = { { 0x2066UL, 0x2070UL }, { 0x2074UL, 0x207EUL }, { 0x2080UL, 0x208EUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, 
{ 0x2100UL, 0x2125UL }, { 0x2127UL, 0x2129UL }, { 0x212CUL, 0x2131UL }, @@ -54,8 +54,7 @@ static const struct range ranges[] = { { 0x2440UL, 0x244AUL }, { 0x2460UL, 0x27FFUL }, { 0x2900UL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2BFFUL }, + { 0x2B76UL, 0x2BFFUL }, { 0x2E00UL, 0x2E5DUL }, { 0x2FF0UL, 0x3004UL }, { 0x3006UL, 0x3006UL }, @@ -101,8 +100,10 @@ static const struct range ranges[] = { { 0x101D0UL, 0x101FCUL }, { 0x102E1UL, 0x102FBUL }, { 0x1BCA0UL, 0x1BCA3UL }, - { 0x1CC00UL, 0x1CCF9UL }, + { 0x1CC00UL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEF0UL }, { 0x1CF50UL, 0x1CFC3UL }, { 0x1D000UL, 0x1D0F5UL }, { 0x1D100UL, 0x1D126UL }, @@ -151,11 +152,10 @@ static const struct range ranges[] = { { 0x1F240UL, 0x1F248UL }, { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, - { 0x1F300UL, 0x1F6D7UL }, + { 0x1F300UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -165,16 +165,18 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F8D0UL, 0x1F8D8UL }, + { 0x1F900UL, 0x1FA57UL }, { 0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 0x1FA80UL, 0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL }, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 0x1FBF9UL }, + { 0x1FB94UL, 0x1FBFAUL }, { 0xE0001UL, 0xE0001UL }, { 0xE0020UL, 0xE007FUL } }; diff --git a/src/libre/class/utf8_Han.c b/src/libre/class/utf8_Han.c index ce3bbce61..0fdf663e0 100644 --- a/src/libre/class/utf8_Han.c +++ 
b/src/libre/class/utf8_Han.c @@ -29,16 +29,15 @@ static const struct range ranges[] = { { 0xF900UL, 0xFA6DUL }, { 0xFA70UL, 0xFAD9UL }, { 0x16FE2UL, 0x16FE3UL }, - { 0x16FF0UL, 0x16FF1UL }, + { 0x16FF0UL, 0x16FF6UL }, { 0x20000UL, 0x2A6DFUL }, - { 0x2A700UL, 0x2B739UL }, - { 0x2B740UL, 0x2B81DUL }, - { 0x2B820UL, 0x2CEA1UL }, + { 0x2A700UL, 0x2B81DUL }, + { 0x2B820UL, 0x2CEADUL }, { 0x2CEB0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EE5DUL }, { 0x2F800UL, 0x2FA1DUL }, { 0x30000UL, 0x3134AUL }, - { 0x31350UL, 0x323AFUL }, + { 0x31350UL, 0x33479UL }, { 0x1720UL, 0x1734UL }, { 0x10D00UL, 0x10D27UL }, { 0x10D30UL, 0x10D39UL } diff --git a/src/libre/class/utf8_Inherited.c b/src/libre/class/utf8_Inherited.c index 8e5afc76c..46ee6739a 100644 --- a/src/libre/class/utf8_Inherited.c +++ b/src/libre/class/utf8_Inherited.c @@ -8,7 +8,8 @@ static const struct range ranges[] = { { 0x064BUL, 0x0655UL }, { 0x0670UL, 0x0670UL }, { 0x0951UL, 0x0954UL }, - { 0x1AB0UL, 0x1ACEUL }, + { 0x1AB0UL, 0x1ADDUL }, + { 0x1AE0UL, 0x1AEBUL }, { 0x1CD0UL, 0x1CD2UL }, { 0x1CD4UL, 0x1CE0UL }, { 0x1CE2UL, 0x1CE8UL }, diff --git a/src/libre/class/utf8_Kannada.c b/src/libre/class/utf8_Kannada.c index c2f9ae6e6..a4f8ebd40 100644 --- a/src/libre/class/utf8_Kannada.c +++ b/src/libre/class/utf8_Kannada.c @@ -12,7 +12,7 @@ static const struct range ranges[] = { { 0x0CC6UL, 0x0CC8UL }, { 0x0CCAUL, 0x0CCDUL }, { 0x0CD5UL, 0x0CD6UL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE3UL }, { 0x0CE6UL, 0x0CEFUL }, { 0x0CF1UL, 0x0CF3UL } diff --git a/src/libre/class/utf8_L.c b/src/libre/class/utf8_L.c index b2c396fcc..e37048bc1 100644 --- a/src/libre/class/utf8_L.c +++ b/src/libre/class/utf8_L.c @@ -53,7 +53,7 @@ static const struct range ranges[] = { { 0x0840UL, 0x0858UL }, { 0x0860UL, 0x086AUL }, { 0x0870UL, 0x0887UL }, - { 0x0889UL, 0x088EUL }, + { 0x0889UL, 0x088FUL }, { 0x08A0UL, 0x08C9UL }, { 0x0904UL, 0x0939UL }, { 0x093DUL, 0x093DUL }, @@ -119,7 +119,7 @@ static const struct range ranges[] = { 
{ 0x0C2AUL, 0x0C39UL }, { 0x0C3DUL, 0x0C3DUL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C61UL }, { 0x0C80UL, 0x0C80UL }, { 0x0C85UL, 0x0C8CUL }, @@ -128,7 +128,7 @@ static const struct range ranges[] = { { 0x0CAAUL, 0x0CB3UL }, { 0x0CB5UL, 0x0CB9UL }, { 0x0CBDUL, 0x0CBDUL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE1UL }, { 0x0CF1UL, 0x0CF2UL }, { 0x0D04UL, 0x0D0CUL }, @@ -314,11 +314,8 @@ static const struct range ranges[] = { { 0xA6A0UL, 0xA6E5UL }, { 0xA717UL, 0xA71FUL }, { 0xA722UL, 0xA788UL }, - { 0xA78BUL, 0xA7CDUL }, - { 0xA7D0UL, 0xA7D1UL }, - { 0xA7D3UL, 0xA7D3UL }, - { 0xA7D5UL, 0xA7DCUL }, - { 0xA7F2UL, 0xA801UL }, + { 0xA78BUL, 0xA7DCUL }, + { 0xA7F1UL, 0xA801UL }, { 0xA803UL, 0xA805UL }, { 0xA807UL, 0xA80AUL }, { 0xA80CUL, 0xA822UL }, @@ -434,6 +431,7 @@ static const struct range ranges[] = { { 0x108F4UL, 0x108F5UL }, { 0x10900UL, 0x10915UL }, { 0x10920UL, 0x10939UL }, + { 0x10940UL, 0x10959UL }, { 0x10980UL, 0x109B7UL }, { 0x109BEUL, 0x109BFUL }, { 0x10A00UL, 0x10A00UL }, @@ -456,7 +454,7 @@ static const struct range ranges[] = { { 0x10D6FUL, 0x10D85UL }, { 0x10E80UL, 0x10EA9UL }, { 0x10EB0UL, 0x10EB1UL }, - { 0x10EC2UL, 0x10EC4UL }, + { 0x10EC2UL, 0x10EC7UL }, { 0x10F00UL, 0x10F1CUL }, { 0x10F27UL, 0x10F27UL }, { 0x10F30UL, 0x10F45UL }, @@ -549,6 +547,7 @@ static const struct range ranges[] = { { 0x11D67UL, 0x11D68UL }, { 0x11D6AUL, 0x11D89UL }, { 0x11D98UL, 0x11D98UL }, + { 0x11DB0UL, 0x11DDBUL }, { 0x11EE0UL, 0x11EF2UL }, { 0x11F02UL, 0x11F02UL }, { 0x11F04UL, 0x11F10UL }, @@ -572,16 +571,19 @@ static const struct range ranges[] = { { 0x16B7DUL, 0x16B8FUL }, { 0x16D40UL, 0x16D6CUL }, { 0x16E40UL, 0x16E7FUL }, + { 0x16EA0UL, 0x16EB8UL }, + { 0x16EBBUL, 0x16ED3UL }, { 0x16F00UL, 0x16F4AUL }, { 0x16F50UL, 0x16F50UL }, { 0x16F93UL, 0x16F9FUL }, { 0x16FE0UL, 0x16FE1UL }, { 0x16FE3UL, 0x16FE3UL }, + { 0x16FF2UL, 0x16FF3UL }, { 0x17000UL, 0x17000UL }, - { 0x187F7UL, 0x187F7UL 
}, - { 0x18800UL, 0x18CD5UL }, + { 0x187FFUL, 0x18CD5UL }, { 0x18CFFUL, 0x18D00UL }, - { 0x18D08UL, 0x18D08UL }, + { 0x18D1EUL, 0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL }, { 0x1AFF0UL, 0x1AFF3UL }, { 0x1AFF5UL, 0x1AFFBUL }, { 0x1AFFDUL, 0x1AFFEUL }, @@ -636,6 +638,12 @@ static const struct range ranges[] = { { 0x1E4D0UL, 0x1E4EBUL }, { 0x1E5D0UL, 0x1E5EDUL }, { 0x1E5F0UL, 0x1E5F0UL }, + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6E2UL }, + { 0x1E6E4UL, 0x1E6E5UL }, + { 0x1E6E7UL, 0x1E6EDUL }, + { 0x1E6F0UL, 0x1E6F4UL }, + { 0x1E6FEUL, 0x1E6FFUL }, { 0x1E7E0UL, 0x1E7E6UL }, { 0x1E7E8UL, 0x1E7EBUL }, { 0x1E7EDUL, 0x1E7EEUL }, @@ -679,11 +687,10 @@ static const struct range ranges[] = { { 0x20000UL, 0x20000UL }, { 0x2A6DFUL, 0x2A6DFUL }, { 0x2A700UL, 0x2A700UL }, - { 0x2B739UL, 0x2B739UL }, - { 0x2B740UL, 0x2B740UL }, + { 0x2B73FUL, 0x2B740UL }, { 0x2B81DUL, 0x2B81DUL }, { 0x2B820UL, 0x2B820UL }, - { 0x2CEA1UL, 0x2CEA1UL }, + { 0x2CEADUL, 0x2CEADUL }, { 0x2CEB0UL, 0x2CEB0UL }, { 0x2EBE0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EBF0UL }, @@ -692,7 +699,8 @@ static const struct range ranges[] = { { 0x30000UL, 0x30000UL }, { 0x3134AUL, 0x3134AUL }, { 0x31350UL, 0x31350UL }, - { 0x323AFUL, 0x323AFUL } + { 0x323AFUL, 0x323B0UL }, + { 0x33479UL, 0x33479UL } }; const struct class utf8_L = { diff --git a/src/libre/class/utf8_Latin.c b/src/libre/class/utf8_Latin.c index 9f59bd35e..362ab61a6 100644 --- a/src/libre/class/utf8_Latin.c +++ b/src/libre/class/utf8_Latin.c @@ -26,11 +26,8 @@ static const struct range ranges[] = { { 0x2160UL, 0x2188UL }, { 0x2C60UL, 0x2C7FUL }, { 0xA722UL, 0xA787UL }, - { 0xA78BUL, 0xA7CDUL }, - { 0xA7D0UL, 0xA7D1UL }, - { 0xA7D3UL, 0xA7D3UL }, - { 0xA7D5UL, 0xA7DCUL }, - { 0xA7F2UL, 0xA7FFUL }, + { 0xA78BUL, 0xA7DCUL }, + { 0xA7F1UL, 0xA7FFUL }, { 0xAB30UL, 0xAB5AUL }, { 0xAB5CUL, 0xAB64UL }, { 0xAB66UL, 0xAB69UL }, diff --git a/src/libre/class/utf8_Ll.c b/src/libre/class/utf8_Ll.c index 889a1da5e..ad884ead6 100644 --- a/src/libre/class/utf8_Ll.c +++ 
b/src/libre/class/utf8_Ll.c @@ -148,7 +148,7 @@ static const struct range ranges[] = { { 0x024BUL, 0x024BUL }, { 0x024DUL, 0x024DUL }, { 0x024FUL, 0x0293UL }, - { 0x0295UL, 0x02AFUL }, + { 0x0296UL, 0x02AFUL }, { 0x0371UL, 0x0371UL }, { 0x0373UL, 0x0373UL }, { 0x0377UL, 0x0377UL }, @@ -609,6 +609,7 @@ static const struct range ranges[] = { { 0xA7C8UL, 0xA7C8UL }, { 0xA7CAUL, 0xA7CAUL }, { 0xA7CDUL, 0xA7CDUL }, + { 0xA7CFUL, 0xA7CFUL }, { 0xA7D1UL, 0xA7D1UL }, { 0xA7D3UL, 0xA7D3UL }, { 0xA7D5UL, 0xA7D5UL }, @@ -633,6 +634,7 @@ static const struct range ranges[] = { { 0x10D70UL, 0x10D85UL }, { 0x118C0UL, 0x118DFUL }, { 0x16E60UL, 0x16E7FUL }, + { 0x16EBBUL, 0x16ED3UL }, { 0x1D41AUL, 0x1D433UL }, { 0x1D44EUL, 0x1D454UL }, { 0x1D456UL, 0x1D467UL }, diff --git a/src/libre/class/utf8_Lm.c b/src/libre/class/utf8_Lm.c index 884125481..3052e9072 100644 --- a/src/libre/class/utf8_Lm.c +++ b/src/libre/class/utf8_Lm.c @@ -49,7 +49,7 @@ static const struct range ranges[] = { { 0xA717UL, 0xA71FUL }, { 0xA770UL, 0xA770UL }, { 0xA788UL, 0xA788UL }, - { 0xA7F2UL, 0xA7F4UL }, + { 0xA7F1UL, 0xA7F4UL }, { 0xA7F8UL, 0xA7F9UL }, { 0xA9CFUL, 0xA9CFUL }, { 0xA9E6UL, 0xA9E6UL }, @@ -65,18 +65,22 @@ static const struct range ranges[] = { { 0x107B2UL, 0x107BAUL }, { 0x10D4EUL, 0x10D4EUL }, { 0x10D6FUL, 0x10D6FUL }, + { 0x10EC5UL, 0x10EC5UL }, + { 0x11DD9UL, 0x11DD9UL }, { 0x16B40UL, 0x16B43UL }, { 0x16D40UL, 0x16D42UL }, { 0x16D6BUL, 0x16D6CUL }, { 0x16F93UL, 0x16F9FUL }, { 0x16FE0UL, 0x16FE1UL }, { 0x16FE3UL, 0x16FE3UL }, + { 0x16FF2UL, 0x16FF3UL }, { 0x1AFF0UL, 0x1AFF3UL }, { 0x1AFF5UL, 0x1AFFBUL }, { 0x1AFFDUL, 0x1AFFEUL }, { 0x1E030UL, 0x1E06DUL }, { 0x1E137UL, 0x1E13DUL }, { 0x1E4EBUL, 0x1E4EBUL }, + { 0x1E6FFUL, 0x1E6FFUL }, { 0x1E94BUL, 0x1E94BUL } }; diff --git a/src/libre/class/utf8_Lo.c b/src/libre/class/utf8_Lo.c index 0ba2e7a9b..859b511dc 100644 --- a/src/libre/class/utf8_Lo.c +++ b/src/libre/class/utf8_Lo.c @@ -7,7 +7,7 @@ static const struct range ranges[] = { { 0x00BAUL, 
0x00BAUL }, { 0x01BBUL, 0x01BBUL }, { 0x01C0UL, 0x01C3UL }, - { 0x0294UL, 0x0294UL }, + { 0x0294UL, 0x0295UL }, { 0x05D0UL, 0x05EAUL }, { 0x05EFUL, 0x05F2UL }, { 0x0620UL, 0x063FUL }, @@ -27,7 +27,7 @@ static const struct range ranges[] = { { 0x0840UL, 0x0858UL }, { 0x0860UL, 0x086AUL }, { 0x0870UL, 0x0887UL }, - { 0x0889UL, 0x088EUL }, + { 0x0889UL, 0x088FUL }, { 0x08A0UL, 0x08C8UL }, { 0x0904UL, 0x0939UL }, { 0x093DUL, 0x093DUL }, @@ -93,7 +93,7 @@ static const struct range ranges[] = { { 0x0C2AUL, 0x0C39UL }, { 0x0C3DUL, 0x0C3DUL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C61UL }, { 0x0C80UL, 0x0C80UL }, { 0x0C85UL, 0x0C8CUL }, @@ -102,7 +102,7 @@ static const struct range ranges[] = { { 0x0CAAUL, 0x0CB3UL }, { 0x0CB5UL, 0x0CB9UL }, { 0x0CBDUL, 0x0CBDUL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE1UL }, { 0x0CF1UL, 0x0CF2UL }, { 0x0D04UL, 0x0D0CUL }, @@ -331,6 +331,7 @@ static const struct range ranges[] = { { 0x108F4UL, 0x108F5UL }, { 0x10900UL, 0x10915UL }, { 0x10920UL, 0x10939UL }, + { 0x10940UL, 0x10959UL }, { 0x10980UL, 0x109B7UL }, { 0x109BEUL, 0x109BFUL }, { 0x10A00UL, 0x10A00UL }, @@ -352,6 +353,7 @@ static const struct range ranges[] = { { 0x10E80UL, 0x10EA9UL }, { 0x10EB0UL, 0x10EB1UL }, { 0x10EC2UL, 0x10EC4UL }, + { 0x10EC6UL, 0x10EC7UL }, { 0x10F00UL, 0x10F1CUL }, { 0x10F27UL, 0x10F27UL }, { 0x10F30UL, 0x10F45UL }, @@ -443,6 +445,8 @@ static const struct range ranges[] = { { 0x11D67UL, 0x11D68UL }, { 0x11D6AUL, 0x11D89UL }, { 0x11D98UL, 0x11D98UL }, + { 0x11DB0UL, 0x11DD8UL }, + { 0x11DDAUL, 0x11DDBUL }, { 0x11EE0UL, 0x11EF2UL }, { 0x11F02UL, 0x11F02UL }, { 0x11F04UL, 0x11F10UL }, @@ -467,10 +471,10 @@ static const struct range ranges[] = { { 0x16F00UL, 0x16F4AUL }, { 0x16F50UL, 0x16F50UL }, { 0x17000UL, 0x17000UL }, - { 0x187F7UL, 0x187F7UL }, - { 0x18800UL, 0x18CD5UL }, + { 0x187FFUL, 0x18CD5UL }, { 0x18CFFUL, 0x18D00UL }, - { 0x18D08UL, 0x18D08UL }, + { 0x18D1EUL, 
0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL }, { 0x1B000UL, 0x1B122UL }, { 0x1B132UL, 0x1B132UL }, { 0x1B150UL, 0x1B152UL }, @@ -489,6 +493,12 @@ static const struct range ranges[] = { { 0x1E4D0UL, 0x1E4EAUL }, { 0x1E5D0UL, 0x1E5EDUL }, { 0x1E5F0UL, 0x1E5F0UL }, + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6E2UL }, + { 0x1E6E4UL, 0x1E6E5UL }, + { 0x1E6E7UL, 0x1E6EDUL }, + { 0x1E6F0UL, 0x1E6F4UL }, + { 0x1E6FEUL, 0x1E6FEUL }, { 0x1E7E0UL, 0x1E7E6UL }, { 0x1E7E8UL, 0x1E7EBUL }, { 0x1E7EDUL, 0x1E7EEUL }, @@ -530,11 +540,10 @@ static const struct range ranges[] = { { 0x20000UL, 0x20000UL }, { 0x2A6DFUL, 0x2A6DFUL }, { 0x2A700UL, 0x2A700UL }, - { 0x2B739UL, 0x2B739UL }, - { 0x2B740UL, 0x2B740UL }, + { 0x2B73FUL, 0x2B740UL }, { 0x2B81DUL, 0x2B81DUL }, { 0x2B820UL, 0x2B820UL }, - { 0x2CEA1UL, 0x2CEA1UL }, + { 0x2CEADUL, 0x2CEADUL }, { 0x2CEB0UL, 0x2CEB0UL }, { 0x2EBE0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EBF0UL }, @@ -543,7 +552,8 @@ static const struct range ranges[] = { { 0x30000UL, 0x30000UL }, { 0x3134AUL, 0x3134AUL }, { 0x31350UL, 0x31350UL }, - { 0x323AFUL, 0x323AFUL } + { 0x323AFUL, 0x323B0UL }, + { 0x33479UL, 0x33479UL } }; const struct class utf8_Lo = { diff --git a/src/libre/class/utf8_Lu.c b/src/libre/class/utf8_Lu.c index 150038e7e..e1f752b18 100644 --- a/src/libre/class/utf8_Lu.c +++ b/src/libre/class/utf8_Lu.c @@ -605,7 +605,10 @@ static const struct range ranges[] = { { 0xA7C4UL, 0xA7C7UL }, { 0xA7C9UL, 0xA7C9UL }, { 0xA7CBUL, 0xA7CCUL }, + { 0xA7CEUL, 0xA7CEUL }, { 0xA7D0UL, 0xA7D0UL }, + { 0xA7D2UL, 0xA7D2UL }, + { 0xA7D4UL, 0xA7D4UL }, { 0xA7D6UL, 0xA7D6UL }, { 0xA7D8UL, 0xA7D8UL }, { 0xA7DAUL, 0xA7DAUL }, @@ -622,6 +625,7 @@ static const struct range ranges[] = { { 0x10D50UL, 0x10D65UL }, { 0x118A0UL, 0x118BFUL }, { 0x16E40UL, 0x16E5FUL }, + { 0x16EA0UL, 0x16EB8UL }, { 0x1D400UL, 0x1D419UL }, { 0x1D434UL, 0x1D44DUL }, { 0x1D468UL, 0x1D481UL }, diff --git a/src/libre/class/utf8_M.c b/src/libre/class/utf8_M.c index f0b3e3af6..ce4a80f2b 100644 --- 
a/src/libre/class/utf8_M.c +++ b/src/libre/class/utf8_M.c @@ -139,7 +139,8 @@ static const struct range ranges[] = { { 0x1A55UL, 0x1A5EUL }, { 0x1A60UL, 0x1A7CUL }, { 0x1A7FUL, 0x1A7FUL }, - { 0x1AB0UL, 0x1ACEUL }, + { 0x1AB0UL, 0x1ADDUL }, + { 0x1AE0UL, 0x1AEBUL }, { 0x1B00UL, 0x1B04UL }, { 0x1B34UL, 0x1B44UL }, { 0x1B6BUL, 0x1B73UL }, @@ -205,7 +206,7 @@ static const struct range ranges[] = { { 0x10D24UL, 0x10D27UL }, { 0x10D69UL, 0x10D6DUL }, { 0x10EABUL, 0x10EACUL }, - { 0x10EFCUL, 0x10EFFUL }, + { 0x10EFAUL, 0x10EFFUL }, { 0x10F46UL, 0x10F50UL }, { 0x10F82UL, 0x10F85UL }, { 0x11000UL, 0x11002UL }, @@ -267,6 +268,7 @@ static const struct range ranges[] = { { 0x11A47UL, 0x11A47UL }, { 0x11A51UL, 0x11A5BUL }, { 0x11A8AUL, 0x11A99UL }, + { 0x11B60UL, 0x11B67UL }, { 0x11C2FUL, 0x11C36UL }, { 0x11C38UL, 0x11C3FUL }, { 0x11C92UL, 0x11CA7UL }, @@ -321,6 +323,10 @@ static const struct range ranges[] = { { 0x1E2ECUL, 0x1E2EFUL }, { 0x1E4ECUL, 0x1E4EFUL }, { 0x1E5EEUL, 0x1E5EFUL }, + { 0x1E6E3UL, 0x1E6E3UL }, + { 0x1E6E6UL, 0x1E6E6UL }, + { 0x1E6EEUL, 0x1E6EFUL }, + { 0x1E6F5UL, 0x1E6F5UL }, { 0x1E8D0UL, 0x1E8D6UL }, { 0x1E944UL, 0x1E94AUL }, { 0xE0100UL, 0xE01EFUL } diff --git a/src/libre/class/utf8_Mc.c b/src/libre/class/utf8_Mc.c index 77917668c..aa391dcb9 100644 --- a/src/libre/class/utf8_Mc.c +++ b/src/libre/class/utf8_Mc.c @@ -175,6 +175,9 @@ static const struct range ranges[] = { { 0x11A39UL, 0x11A39UL }, { 0x11A57UL, 0x11A58UL }, { 0x11A97UL, 0x11A97UL }, + { 0x11B61UL, 0x11B61UL }, + { 0x11B65UL, 0x11B65UL }, + { 0x11B67UL, 0x11B67UL }, { 0x11C2FUL, 0x11C2FUL }, { 0x11C3EUL, 0x11C3EUL }, { 0x11CA9UL, 0x11CA9UL }, diff --git a/src/libre/class/utf8_Mn.c b/src/libre/class/utf8_Mn.c index c204fa592..4ad1f3ddd 100644 --- a/src/libre/class/utf8_Mn.c +++ b/src/libre/class/utf8_Mn.c @@ -145,7 +145,8 @@ static const struct range ranges[] = { { 0x1A73UL, 0x1A7CUL }, { 0x1A7FUL, 0x1A7FUL }, { 0x1AB0UL, 0x1ABDUL }, - { 0x1ABFUL, 0x1ACEUL }, + { 0x1ABFUL, 0x1ADDUL }, + { 
0x1AE0UL, 0x1AEBUL }, { 0x1B00UL, 0x1B03UL }, { 0x1B34UL, 0x1B34UL }, { 0x1B36UL, 0x1B3AUL }, @@ -227,7 +228,7 @@ static const struct range ranges[] = { { 0x10D24UL, 0x10D27UL }, { 0x10D69UL, 0x10D6DUL }, { 0x10EABUL, 0x10EACUL }, - { 0x10EFCUL, 0x10EFFUL }, + { 0x10EFAUL, 0x10EFFUL }, { 0x10F46UL, 0x10F50UL }, { 0x10F82UL, 0x10F85UL }, { 0x11001UL, 0x11001UL }, @@ -302,6 +303,9 @@ static const struct range ranges[] = { { 0x11A59UL, 0x11A5BUL }, { 0x11A8AUL, 0x11A96UL }, { 0x11A98UL, 0x11A99UL }, + { 0x11B60UL, 0x11B60UL }, + { 0x11B62UL, 0x11B64UL }, + { 0x11B66UL, 0x11B66UL }, { 0x11C30UL, 0x11C36UL }, { 0x11C38UL, 0x11C3DUL }, { 0x11C3FUL, 0x11C3FUL }, @@ -357,6 +361,10 @@ static const struct range ranges[] = { { 0x1E2ECUL, 0x1E2EFUL }, { 0x1E4ECUL, 0x1E4EFUL }, { 0x1E5EEUL, 0x1E5EFUL }, + { 0x1E6E3UL, 0x1E6E3UL }, + { 0x1E6E6UL, 0x1E6E6UL }, + { 0x1E6EEUL, 0x1E6EFUL }, + { 0x1E6F5UL, 0x1E6F5UL }, { 0x1E8D0UL, 0x1E8D6UL }, { 0x1E944UL, 0x1E94AUL }, { 0xE0100UL, 0xE01EFUL } diff --git a/src/libre/class/utf8_N.c b/src/libre/class/utf8_N.c index a4b7fdebc..fa7dcc037 100644 --- a/src/libre/class/utf8_N.c +++ b/src/libre/class/utf8_N.c @@ -119,6 +119,7 @@ static const struct range ranges[] = { { 0x11C50UL, 0x11C6CUL }, { 0x11D50UL, 0x11D59UL }, { 0x11DA0UL, 0x11DA9UL }, + { 0x11DE0UL, 0x11DE9UL }, { 0x11F50UL, 0x11F59UL }, { 0x11FC0UL, 0x11FD4UL }, { 0x12400UL, 0x1246EUL }, @@ -129,6 +130,7 @@ static const struct range ranges[] = { { 0x16B5BUL, 0x16B61UL }, { 0x16D70UL, 0x16D79UL }, { 0x16E80UL, 0x16E96UL }, + { 0x16FF4UL, 0x16FF6UL }, { 0x1CCF0UL, 0x1CCF9UL }, { 0x1D2C0UL, 0x1D2D3UL }, { 0x1D2E0UL, 0x1D2F3UL }, diff --git a/src/libre/class/utf8_Nd.c b/src/libre/class/utf8_Nd.c index 7506f2d2e..19822a94b 100644 --- a/src/libre/class/utf8_Nd.c +++ b/src/libre/class/utf8_Nd.c @@ -60,6 +60,7 @@ static const struct range ranges[] = { { 0x11C50UL, 0x11C59UL }, { 0x11D50UL, 0x11D59UL }, { 0x11DA0UL, 0x11DA9UL }, + { 0x11DE0UL, 0x11DE9UL }, { 0x11F50UL, 0x11F59UL }, { 
0x16130UL, 0x16139UL }, { 0x16A60UL, 0x16A69UL }, diff --git a/src/libre/class/utf8_Nl.c b/src/libre/class/utf8_Nl.c index fdf83eb7f..71097a550 100644 --- a/src/libre/class/utf8_Nl.c +++ b/src/libre/class/utf8_Nl.c @@ -14,7 +14,8 @@ static const struct range ranges[] = { { 0x10341UL, 0x10341UL }, { 0x1034AUL, 0x1034AUL }, { 0x103D1UL, 0x103D5UL }, - { 0x12400UL, 0x1246EUL } + { 0x12400UL, 0x1246EUL }, + { 0x16FF4UL, 0x16FF6UL } }; const struct class utf8_Nl = { diff --git a/src/libre/class/utf8_P.c b/src/libre/class/utf8_P.c index 9c506bd13..664a1d28c 100644 --- a/src/libre/class/utf8_P.c +++ b/src/libre/class/utf8_P.c @@ -151,6 +151,7 @@ static const struct range ranges[] = { { 0x10B99UL, 0x10B9CUL }, { 0x10D6EUL, 0x10D6EUL }, { 0x10EADUL, 0x10EADUL }, + { 0x10ED0UL, 0x10ED0UL }, { 0x10F55UL, 0x10F59UL }, { 0x10F86UL, 0x10F89UL }, { 0x11047UL, 0x1104DUL }, diff --git a/src/libre/class/utf8_Po.c b/src/libre/class/utf8_Po.c index ef95b8586..4b20947c0 100644 --- a/src/libre/class/utf8_Po.c +++ b/src/libre/class/utf8_Po.c @@ -146,6 +146,7 @@ static const struct range ranges[] = { { 0x10AF0UL, 0x10AF6UL }, { 0x10B39UL, 0x10B3FUL }, { 0x10B99UL, 0x10B9CUL }, + { 0x10ED0UL, 0x10ED0UL }, { 0x10F55UL, 0x10F59UL }, { 0x10F86UL, 0x10F89UL }, { 0x11047UL, 0x1104DUL }, diff --git a/src/libre/class/utf8_S.c b/src/libre/class/utf8_S.c index 0b69e43fe..9b66a152c 100644 --- a/src/libre/class/utf8_S.c +++ b/src/libre/class/utf8_S.c @@ -75,7 +75,7 @@ static const struct range ranges[] = { { 0x2052UL, 0x2052UL }, { 0x207AUL, 0x207CUL }, { 0x208AUL, 0x208CUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, { 0x2100UL, 0x2101UL }, { 0x2103UL, 0x2106UL }, { 0x2108UL, 0x2109UL }, @@ -103,8 +103,7 @@ static const struct range ranges[] = { { 0x2999UL, 0x29D7UL }, { 0x29DCUL, 0x29FBUL }, { 0x29FEUL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2BFFUL }, + { 0x2B76UL, 0x2BFFUL }, { 0x2CE5UL, 0x2CEAUL }, { 0x2E50UL, 0x2E51UL }, { 0x2E80UL, 0x2E99UL }, @@ -138,9 +137,10 @@ 
static const struct range ranges[] = { { 0xAB5BUL, 0xAB5BUL }, { 0xAB6AUL, 0xAB6BUL }, { 0xFB29UL, 0xFB29UL }, - { 0xFBB2UL, 0xFBC2UL }, + { 0xFBB2UL, 0xFBD2UL }, { 0xFD40UL, 0xFD4FUL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFD90UL, 0xFD91UL }, + { 0xFDC8UL, 0xFDCFUL }, { 0xFDFCUL, 0xFDFFUL }, { 0xFE62UL, 0xFE62UL }, { 0xFE64UL, 0xFE66UL }, @@ -164,13 +164,17 @@ static const struct range ranges[] = { { 0x10877UL, 0x10878UL }, { 0x10AC8UL, 0x10AC8UL }, { 0x10D8EUL, 0x10D8FUL }, + { 0x10ED1UL, 0x10ED8UL }, { 0x1173FUL, 0x1173FUL }, { 0x11FD5UL, 0x11FF1UL }, { 0x16B3CUL, 0x16B3FUL }, { 0x16B45UL, 0x16B45UL }, { 0x1BC9CUL, 0x1BC9CUL }, { 0x1CC00UL, 0x1CCEFUL }, + { 0x1CCFAUL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEF0UL }, { 0x1CF50UL, 0x1CFC3UL }, { 0x1D000UL, 0x1D0F5UL }, { 0x1D100UL, 0x1D126UL }, @@ -215,11 +219,10 @@ static const struct range ranges[] = { { 0x1F240UL, 0x1F248UL }, { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, - { 0x1F300UL, 0x1F6D7UL }, + { 0x1F300UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -229,16 +232,19 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F8D0UL, 0x1F8D8UL }, + { 0x1F900UL, 0x1FA57UL }, { 0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 0x1FA80UL, 0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL }, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 0x1FBEFUL } + { 0x1FB94UL, 0x1FBEFUL }, + { 0x1FBFAUL, 0x1FBFAUL } }; const struct class utf8_S = { diff --git 
a/src/libre/class/utf8_Sc.c b/src/libre/class/utf8_Sc.c index 0effc2b86..3bba263b8 100644 --- a/src/libre/class/utf8_Sc.c +++ b/src/libre/class/utf8_Sc.c @@ -14,7 +14,7 @@ static const struct range ranges[] = { { 0x0BF9UL, 0x0BF9UL }, { 0x0E3FUL, 0x0E3FUL }, { 0x17DBUL, 0x17DBUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, { 0xA838UL, 0xA838UL }, { 0xFDFCUL, 0xFDFCUL }, { 0xFE69UL, 0xFE69UL }, diff --git a/src/libre/class/utf8_Sharada.c b/src/libre/class/utf8_Sharada.c index e51b2edff..dbfa21d3f 100644 --- a/src/libre/class/utf8_Sharada.c +++ b/src/libre/class/utf8_Sharada.c @@ -3,7 +3,8 @@ #include "class.h" static const struct range ranges[] = { - { 0x11180UL, 0x111DFUL } + { 0x11180UL, 0x111DFUL }, + { 0x11B60UL, 0x11B67UL } }; const struct class utf8_Sharada = { diff --git a/src/libre/class/utf8_Sidetic.c b/src/libre/class/utf8_Sidetic.c new file mode 100644 index 000000000..ed53b543d --- /dev/null +++ b/src/libre/class/utf8_Sidetic.c @@ -0,0 +1,13 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x10940UL, 0x10959UL } +}; + +const struct class utf8_Sidetic = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_Sm.c b/src/libre/class/utf8_Sm.c index 9c39b8003..e157e5f8c 100644 --- a/src/libre/class/utf8_Sm.c +++ b/src/libre/class/utf8_Sm.c @@ -57,6 +57,7 @@ static const struct range ranges[] = { { 0xFFE2UL, 0xFFE2UL }, { 0xFFE9UL, 0xFFECUL }, { 0x10D8EUL, 0x10D8FUL }, + { 0x1CEF0UL, 0x1CEF0UL }, { 0x1D6C1UL, 0x1D6C1UL }, { 0x1D6DBUL, 0x1D6DBUL }, { 0x1D6FBUL, 0x1D6FBUL }, @@ -67,7 +68,8 @@ static const struct range ranges[] = { { 0x1D789UL, 0x1D789UL }, { 0x1D7A9UL, 0x1D7A9UL }, { 0x1D7C3UL, 0x1D7C3UL }, - { 0x1EEF0UL, 0x1EEF1UL } + { 0x1EEF0UL, 0x1EEF1UL }, + { 0x1F8D0UL, 0x1F8D8UL } }; const struct class utf8_Sm = { diff --git a/src/libre/class/utf8_So.c b/src/libre/class/utf8_So.c index d7e608ca4..93ca7e36b 100644 --- a/src/libre/class/utf8_So.c +++ b/src/libre/class/utf8_So.c 
@@ -82,8 +82,7 @@ static const struct range ranges[] = { { 0x2B00UL, 0x2B2FUL }, { 0x2B45UL, 0x2B46UL }, { 0x2B4DUL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2BFFUL }, + { 0x2B76UL, 0x2BFFUL }, { 0x2CE5UL, 0x2CEAUL }, { 0x2E50UL, 0x2E51UL }, { 0x2E80UL, 0x2E99UL }, @@ -111,8 +110,10 @@ static const struct range ranges[] = { { 0xA836UL, 0xA837UL }, { 0xA839UL, 0xA839UL }, { 0xAA77UL, 0xAA79UL }, + { 0xFBC3UL, 0xFBD2UL }, { 0xFD40UL, 0xFD4FUL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFD90UL, 0xFD91UL }, + { 0xFDC8UL, 0xFDCFUL }, { 0xFDFDUL, 0xFDFFUL }, { 0xFFE4UL, 0xFFE4UL }, { 0xFFE8UL, 0xFFE8UL }, @@ -126,6 +127,7 @@ static const struct range ranges[] = { { 0x101D0UL, 0x101FCUL }, { 0x10877UL, 0x10878UL }, { 0x10AC8UL, 0x10AC8UL }, + { 0x10ED1UL, 0x10ED8UL }, { 0x1173FUL, 0x1173FUL }, { 0x11FD5UL, 0x11FDCUL }, { 0x11FE1UL, 0x11FF1UL }, @@ -133,7 +135,10 @@ static const struct range ranges[] = { { 0x16B45UL, 0x16B45UL }, { 0x1BC9CUL, 0x1BC9CUL }, { 0x1CC00UL, 0x1CCEFUL }, + { 0x1CCFAUL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEEFUL }, { 0x1CF50UL, 0x1CFC3UL }, { 0x1D000UL, 0x1D0F5UL }, { 0x1D100UL, 0x1D126UL }, @@ -166,11 +171,10 @@ static const struct range ranges[] = { { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, { 0x1F300UL, 0x1F3FAUL }, - { 0x1F400UL, 0x1F6D7UL }, + { 0x1F400UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -180,16 +184,18 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F900UL, 0x1FA57UL }, { 0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 0x1FA80UL, 
0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL }, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 0x1FBEFUL } + { 0x1FB94UL, 0x1FBEFUL }, + { 0x1FBFAUL, 0x1FBFAUL } }; const struct class utf8_So = { diff --git a/src/libre/class/utf8_Tai_Yo.c b/src/libre/class/utf8_Tai_Yo.c new file mode 100644 index 000000000..643d73fe2 --- /dev/null +++ b/src/libre/class/utf8_Tai_Yo.c @@ -0,0 +1,15 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6F5UL }, + { 0x1E6FEUL, 0x1E6FFUL } +}; + +const struct class utf8_Tai_Yo = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_Tangut.c b/src/libre/class/utf8_Tangut.c index cbd82c907..6bf5e5687 100644 --- a/src/libre/class/utf8_Tangut.c +++ b/src/libre/class/utf8_Tangut.c @@ -4,9 +4,9 @@ static const struct range ranges[] = { { 0x16FE0UL, 0x16FE0UL }, - { 0x17000UL, 0x187F7UL }, - { 0x18800UL, 0x18AFFUL }, - { 0x18D00UL, 0x18D08UL } + { 0x17000UL, 0x18AFFUL }, + { 0x18D00UL, 0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL } }; const struct class utf8_Tangut = { diff --git a/src/libre/class/utf8_Telugu.c b/src/libre/class/utf8_Telugu.c index a2c65ae49..1b80f5e1f 100644 --- a/src/libre/class/utf8_Telugu.c +++ b/src/libre/class/utf8_Telugu.c @@ -12,7 +12,7 @@ static const struct range ranges[] = { { 0x0C4AUL, 0x0C4DUL }, { 0x0C55UL, 0x0C56UL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C63UL }, { 0x0C66UL, 0x0C6FUL }, { 0x0C77UL, 0x0C7FUL } diff --git a/src/libre/class/utf8_Tolong_Siki.c b/src/libre/class/utf8_Tolong_Siki.c new file mode 100644 index 000000000..e670a957b --- /dev/null +++ b/src/libre/class/utf8_Tolong_Siki.c @@ -0,0 +1,14 @@ +/* generated */ + +#include "class.h" + +static const struct range ranges[] = { + { 0x11DB0UL, 0x11DDBUL }, + { 0x11DE0UL, 0x11DE9UL } +}; + +const struct 
class utf8_Tolong_Siki = { + ranges, + sizeof ranges / sizeof *ranges +}; + diff --git a/src/libre/class/utf8_assigned.c b/src/libre/class/utf8_assigned.c index d38345570..32b689a26 100644 --- a/src/libre/class/utf8_assigned.c +++ b/src/libre/class/utf8_assigned.c @@ -24,8 +24,7 @@ static const struct range ranges[] = { { 0x0840UL, 0x085BUL }, { 0x085EUL, 0x085EUL }, { 0x0860UL, 0x086AUL }, - { 0x0870UL, 0x088EUL }, - { 0x0890UL, 0x0891UL }, + { 0x0870UL, 0x0891UL }, { 0x0897UL, 0x0983UL }, { 0x0985UL, 0x098CUL }, { 0x098FUL, 0x0990UL }, @@ -109,7 +108,7 @@ static const struct range ranges[] = { { 0x0C4AUL, 0x0C4DUL }, { 0x0C55UL, 0x0C56UL }, { 0x0C58UL, 0x0C5AUL }, - { 0x0C5DUL, 0x0C5DUL }, + { 0x0C5CUL, 0x0C5DUL }, { 0x0C60UL, 0x0C63UL }, { 0x0C66UL, 0x0C6FUL }, { 0x0C77UL, 0x0C8CUL }, @@ -121,7 +120,7 @@ static const struct range ranges[] = { { 0x0CC6UL, 0x0CC8UL }, { 0x0CCAUL, 0x0CCDUL }, { 0x0CD5UL, 0x0CD6UL }, - { 0x0CDDUL, 0x0CDEUL }, + { 0x0CDCUL, 0x0CDEUL }, { 0x0CE0UL, 0x0CE3UL }, { 0x0CE6UL, 0x0CEFUL }, { 0x0CF1UL, 0x0CF3UL }, @@ -216,7 +215,8 @@ static const struct range ranges[] = { { 0x1A7FUL, 0x1A89UL }, { 0x1A90UL, 0x1A99UL }, { 0x1AA0UL, 0x1AADUL }, - { 0x1AB0UL, 0x1ACEUL }, + { 0x1AB0UL, 0x1ADDUL }, + { 0x1AE0UL, 0x1AEBUL }, { 0x1B00UL, 0x1B4CUL }, { 0x1B4EUL, 0x1BF3UL }, { 0x1BFCUL, 0x1C37UL }, @@ -245,14 +245,13 @@ static const struct range ranges[] = { { 0x2066UL, 0x2071UL }, { 0x2074UL, 0x208EUL }, { 0x2090UL, 0x209CUL }, - { 0x20A0UL, 0x20C0UL }, + { 0x20A0UL, 0x20C1UL }, { 0x20D0UL, 0x20F0UL }, { 0x2100UL, 0x218BUL }, { 0x2190UL, 0x2429UL }, { 0x2440UL, 0x244AUL }, { 0x2460UL, 0x2B73UL }, - { 0x2B76UL, 0x2B95UL }, - { 0x2B97UL, 0x2CF3UL }, + { 0x2B76UL, 0x2CF3UL }, { 0x2CF9UL, 0x2D25UL }, { 0x2D27UL, 0x2D27UL }, { 0x2D2DUL, 0x2D2DUL }, @@ -284,11 +283,8 @@ static const struct range ranges[] = { { 0xA490UL, 0xA4C6UL }, { 0xA4D0UL, 0xA62BUL }, { 0xA640UL, 0xA6F7UL }, - { 0xA700UL, 0xA7CDUL }, - { 0xA7D0UL, 0xA7D1UL }, - { 0xA7D3UL, 0xA7D3UL }, 
- { 0xA7D5UL, 0xA7DCUL }, - { 0xA7F2UL, 0xA82CUL }, + { 0xA700UL, 0xA7DCUL }, + { 0xA7F1UL, 0xA82CUL }, { 0xA830UL, 0xA839UL }, { 0xA840UL, 0xA877UL }, { 0xA880UL, 0xA8C5UL }, @@ -328,10 +324,7 @@ static const struct range ranges[] = { { 0xFB3EUL, 0xFB3EUL }, { 0xFB40UL, 0xFB41UL }, { 0xFB43UL, 0xFB44UL }, - { 0xFB46UL, 0xFBC2UL }, - { 0xFBD3UL, 0xFD8FUL }, - { 0xFD92UL, 0xFDC7UL }, - { 0xFDCFUL, 0xFDCFUL }, + { 0xFB46UL, 0xFDCFUL }, { 0xFDF0UL, 0xFE19UL }, { 0xFE20UL, 0xFE52UL }, { 0xFE54UL, 0xFE66UL }, @@ -402,7 +395,7 @@ static const struct range ranges[] = { { 0x108F4UL, 0x108F5UL }, { 0x108FBUL, 0x1091BUL }, { 0x1091FUL, 0x10939UL }, - { 0x1093FUL, 0x1093FUL }, + { 0x1093FUL, 0x10959UL }, { 0x10980UL, 0x109B7UL }, { 0x109BCUL, 0x109CFUL }, { 0x109D2UL, 0x10A03UL }, @@ -434,8 +427,9 @@ static const struct range ranges[] = { { 0x10E80UL, 0x10EA9UL }, { 0x10EABUL, 0x10EADUL }, { 0x10EB0UL, 0x10EB1UL }, - { 0x10EC2UL, 0x10EC4UL }, - { 0x10EFCUL, 0x10F27UL }, + { 0x10EC2UL, 0x10EC7UL }, + { 0x10ED0UL, 0x10ED8UL }, + { 0x10EFAUL, 0x10F27UL }, { 0x10F30UL, 0x10F59UL }, { 0x10F70UL, 0x10F89UL }, { 0x10FB0UL, 0x10FCBUL }, @@ -518,6 +512,7 @@ static const struct range ranges[] = { { 0x11A50UL, 0x11AA2UL }, { 0x11AB0UL, 0x11AF8UL }, { 0x11B00UL, 0x11B09UL }, + { 0x11B60UL, 0x11B67UL }, { 0x11BC0UL, 0x11BE1UL }, { 0x11BF0UL, 0x11BF9UL }, { 0x11C00UL, 0x11C08UL }, @@ -540,6 +535,8 @@ static const struct range ranges[] = { { 0x11D90UL, 0x11D91UL }, { 0x11D93UL, 0x11D98UL }, { 0x11DA0UL, 0x11DA9UL }, + { 0x11DB0UL, 0x11DDBUL }, + { 0x11DE0UL, 0x11DE9UL }, { 0x11EE0UL, 0x11EF8UL }, { 0x11F00UL, 0x11F10UL }, { 0x11F12UL, 0x11F3AUL }, @@ -569,16 +566,18 @@ static const struct range ranges[] = { { 0x16B7DUL, 0x16B8FUL }, { 0x16D40UL, 0x16D79UL }, { 0x16E40UL, 0x16E9AUL }, + { 0x16EA0UL, 0x16EB8UL }, + { 0x16EBBUL, 0x16ED3UL }, { 0x16F00UL, 0x16F4AUL }, { 0x16F4FUL, 0x16F87UL }, { 0x16F8FUL, 0x16F9FUL }, { 0x16FE0UL, 0x16FE4UL }, - { 0x16FF0UL, 0x16FF1UL }, + { 0x16FF0UL, 
0x16FF6UL }, { 0x17000UL, 0x17000UL }, - { 0x187F7UL, 0x187F7UL }, - { 0x18800UL, 0x18CD5UL }, + { 0x187FFUL, 0x18CD5UL }, { 0x18CFFUL, 0x18D00UL }, - { 0x18D08UL, 0x18D08UL }, + { 0x18D1EUL, 0x18D1EUL }, + { 0x18D80UL, 0x18DF2UL }, { 0x1AFF0UL, 0x1AFF3UL }, { 0x1AFF5UL, 0x1AFFBUL }, { 0x1AFFDUL, 0x1AFFEUL }, @@ -593,8 +592,10 @@ static const struct range ranges[] = { { 0x1BC80UL, 0x1BC88UL }, { 0x1BC90UL, 0x1BC99UL }, { 0x1BC9CUL, 0x1BCA3UL }, - { 0x1CC00UL, 0x1CCF9UL }, + { 0x1CC00UL, 0x1CCFCUL }, { 0x1CD00UL, 0x1CEB3UL }, + { 0x1CEBAUL, 0x1CED0UL }, + { 0x1CEE0UL, 0x1CEF0UL }, { 0x1CF00UL, 0x1CF2DUL }, { 0x1CF30UL, 0x1CF46UL }, { 0x1CF50UL, 0x1CFC3UL }, @@ -648,6 +649,9 @@ static const struct range ranges[] = { { 0x1E4D0UL, 0x1E4F9UL }, { 0x1E5D0UL, 0x1E5FAUL }, { 0x1E5FFUL, 0x1E5FFUL }, + { 0x1E6C0UL, 0x1E6DEUL }, + { 0x1E6E0UL, 0x1E6F5UL }, + { 0x1E6FEUL, 0x1E6FFUL }, { 0x1E7E0UL, 0x1E7E6UL }, { 0x1E7E8UL, 0x1E7EBUL }, { 0x1E7EDUL, 0x1E7EEUL }, @@ -705,11 +709,10 @@ static const struct range ranges[] = { { 0x1F240UL, 0x1F248UL }, { 0x1F250UL, 0x1F251UL }, { 0x1F260UL, 0x1F265UL }, - { 0x1F300UL, 0x1F6D7UL }, + { 0x1F300UL, 0x1F6D8UL }, { 0x1F6DCUL, 0x1F6ECUL }, { 0x1F6F0UL, 0x1F6FCUL }, - { 0x1F700UL, 0x1F776UL }, - { 0x1F77BUL, 0x1F7D9UL }, + { 0x1F700UL, 0x1F7D9UL }, { 0x1F7E0UL, 0x1F7EBUL }, { 0x1F7F0UL, 0x1F7F0UL }, { 0x1F800UL, 0x1F80BUL }, @@ -719,24 +722,25 @@ static const struct range ranges[] = { { 0x1F890UL, 0x1F8ADUL }, { 0x1F8B0UL, 0x1F8BBUL }, { 0x1F8C0UL, 0x1F8C1UL }, - { 0x1F900UL, 0x1FA53UL }, + { 0x1F8D0UL, 0x1F8D8UL }, + { 0x1F900UL, 0x1FA57UL }, { 0x1FA60UL, 0x1FA6DUL }, { 0x1FA70UL, 0x1FA7CUL }, - { 0x1FA80UL, 0x1FA89UL }, - { 0x1FA8FUL, 0x1FAC6UL }, - { 0x1FACEUL, 0x1FADCUL }, - { 0x1FADFUL, 0x1FAE9UL }, - { 0x1FAF0UL, 0x1FAF8UL }, + { 0x1FA80UL, 0x1FA8AUL }, + { 0x1FA8EUL, 0x1FAC6UL }, + { 0x1FAC8UL, 0x1FAC8UL }, + { 0x1FACDUL, 0x1FADCUL }, + { 0x1FADFUL, 0x1FAEAUL }, + { 0x1FAEFUL, 0x1FAF8UL }, { 0x1FB00UL, 0x1FB92UL }, - { 0x1FB94UL, 
0x1FBF9UL }, + { 0x1FB94UL, 0x1FBFAUL }, { 0x20000UL, 0x20000UL }, { 0x2A6DFUL, 0x2A6DFUL }, { 0x2A700UL, 0x2A700UL }, - { 0x2B739UL, 0x2B739UL }, - { 0x2B740UL, 0x2B740UL }, + { 0x2B73FUL, 0x2B740UL }, { 0x2B81DUL, 0x2B81DUL }, { 0x2B820UL, 0x2B820UL }, - { 0x2CEA1UL, 0x2CEA1UL }, + { 0x2CEADUL, 0x2CEADUL }, { 0x2CEB0UL, 0x2CEB0UL }, { 0x2EBE0UL, 0x2EBE0UL }, { 0x2EBF0UL, 0x2EBF0UL }, @@ -745,7 +749,8 @@ static const struct range ranges[] = { { 0x30000UL, 0x30000UL }, { 0x3134AUL, 0x3134AUL }, { 0x31350UL, 0x31350UL }, - { 0x323AFUL, 0x323AFUL }, + { 0x323AFUL, 0x323B0UL }, + { 0x33479UL, 0x33479UL }, { 0xE0001UL, 0xE0001UL }, { 0xE0020UL, 0xE007FUL }, { 0xE0100UL, 0xE01EFUL }, diff --git a/src/libre/class_name.c b/src/libre/class_name.c index d96a1d610..e4cb7cc0f 100644 --- a/src/libre/class_name.c +++ b/src/libre/class_name.c @@ -50,6 +50,7 @@ static struct { { &utf8_Bassa_Vah, "Bassa Vah" }, { &utf8_Batak, "Batak" }, { &utf8_Bengali, "Bengali" }, + { &utf8_Beria_Erfe, "Beria Erfe" }, { &utf8_Bhaiksuki, "Bhaiksuki" }, { &utf8_Bopomofo, "Bopomofo" }, { &utf8_Brahmi, "Brahmi" }, @@ -174,6 +175,7 @@ static struct { { &utf8_Sharada, "Sharada" }, { &utf8_Shavian, "Shavian" }, { &utf8_Siddham, "Siddham" }, + { &utf8_Sidetic, "Sidetic" }, { &utf8_SignWriting, "SignWriting" }, { &utf8_Sinhala, "Sinhala" }, { &utf8_Sogdian, "Sogdian" }, @@ -188,6 +190,7 @@ static struct { { &utf8_Tai_Le, "Tai Le" }, { &utf8_Tai_Tham, "Tai Tham" }, { &utf8_Tai_Viet, "Tai Viet" }, + { &utf8_Tai_Yo, "Tai Yo" }, { &utf8_Takri, "Takri" }, { &utf8_Tamil, "Tamil" }, { &utf8_Tangsa, "Tangsa" }, @@ -199,6 +202,7 @@ static struct { { &utf8_Tifinagh, "Tifinagh" }, { &utf8_Tirhuta, "Tirhuta" }, { &utf8_Todhri, "Todhri" }, + { &utf8_Tolong_Siki, "Tolong Siki" }, { &utf8_Toto, "Toto" }, { &utf8_Tulu_Tigalari, "Tulu Tigalari" }, { &utf8_Ugaritic, "Ugaritic" }, diff --git a/src/libre/dialect/glob/lexer.c b/src/libre/dialect/glob/lexer.c index 843cedc4e..2a4d33a29 100644 --- 
a/src/libre/dialect/glob/lexer.c +++ b/src/libre/dialect/glob/lexer.c @@ -10,11 +10,31 @@ static enum lx_glob_token z0(struct lx_glob_lx *lx); +static int +lx_glob_advance_end(struct lx_glob_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_glob_lx *lx) +lx_glob_getc(struct lx_glob_lx *lx) { int c; @@ -30,18 +50,19 @@ lx_getc(struct lx_glob_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_glob_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_glob_getc to the interface + * in libfsm's generated code. 
*/ +static int +fsm_getc(void *getc_opaque) +{ + return lx_glob_getc((struct lx_glob_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -50,10 +71,7 @@ lx_glob_ungetc(struct lx_glob_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -105,6 +123,17 @@ lx_glob_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_glob_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_glob_dynclear(void *buf_opaque) { @@ -144,29 +173,28 @@ lx_glob_dynfree(void *buf_opaque) static enum lx_glob_token z0(struct lx_glob_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '*': state = S2; break; case '?': state = S3; break; @@ -175,34 +203,41 @@ z0(struct lx_glob_lx *lx) break; case S1: /* e.g. "\\x00" */ - lx_glob_ungetc(lx, c); return TOK_CHAR; + lx_glob_ungetc(lx, c); lx_glob_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "*" */ - lx_glob_ungetc(lx, c); return TOK_MANY; + lx_glob_ungetc(lx, c); lx_glob_dynpop(lx->buf_opaque); return TOK_MANY; - case S3: /* e.g. "?" */ - lx_glob_ungetc(lx, c); return TOK_ANY; + case S3: /* e.g. 
"\077" */ + lx_glob_ungetc(lx, c); lx_glob_dynpop(lx->buf_opaque); return TOK_ANY; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_MANY; + case S3: return TOK_ANY; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_glob_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_MANY; - case S3: return TOK_ANY; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -254,6 +289,7 @@ lx_glob_init(struct lx_glob_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_glob_dynpop; } enum lx_glob_token diff --git a/src/libre/dialect/glob/parser.c b/src/libre/dialect/glob/parser.c index c8f021380..b20798f4f 100644 --- a/src/libre/dialect/glob/parser.c +++ b/src/libre/dialect/glob/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -304,7 +304,7 @@ ZL2_list_Hof_Hatoms:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -322,7 +322,7 @@ ZL2_list_Hof_Hatoms:; goto ZL2_list_Hof_Hatoms; /* END OF INLINE: list-of-atoms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -348,7 +348,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -358,7 +358,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -378,7 +378,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -397,7 +397,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIe) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZIe) == NULL) { @@ -418,7 +418,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -428,7 +428,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIg) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIg) == NULL) { @@ -440,7 +440,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -449,7 +449,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: count-zero-or-more */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); @@ -478,7 +478,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -490,7 +490,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -523,7 +523,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -537,7 +537,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZIe) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -558,7 
+558,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -575,7 +575,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* END OF INLINE: 119 */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -601,7 +601,7 @@ p_re__glob(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -626,7 +626,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/glob/parser.h b/src/libre/dialect/glob/parser.h index ec618caca..89800e6e9 100644 --- a/src/libre/dialect/glob/parser.h +++ b/src/libre/dialect/glob/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__glob(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/glob/parser.h" diff --git a/src/libre/dialect/like/lexer.c b/src/libre/dialect/like/lexer.c index 4f4dcbdab..2edc365a9 100644 --- a/src/libre/dialect/like/lexer.c +++ b/src/libre/dialect/like/lexer.c @@ -10,11 +10,31 @@ static enum lx_like_token z0(struct lx_like_lx *lx); +static int +lx_like_advance_end(struct lx_like_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + 
lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_like_lx *lx) +lx_like_getc(struct lx_like_lx *lx) { int c; @@ -30,18 +50,19 @@ lx_getc(struct lx_like_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_like_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_like_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_like_getc((struct lx_like_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -50,10 +71,7 @@ lx_like_ungetc(struct lx_like_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -105,6 +123,17 @@ lx_like_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_like_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_like_dynclear(void *buf_opaque) { @@ -144,29 +173,28 @@ lx_like_dynfree(void *buf_opaque) static enum lx_like_token z0(struct lx_like_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '%': state = S2; break; case '_': state = S3; break; @@ -175,34 +203,41 @@ z0(struct lx_like_lx *lx) break; case S1: /* e.g. "\\x00" */ - lx_like_ungetc(lx, c); return TOK_CHAR; + lx_like_ungetc(lx, c); lx_like_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "%" */ - lx_like_ungetc(lx, c); return TOK_MANY; + lx_like_ungetc(lx, c); lx_like_dynpop(lx->buf_opaque); return TOK_MANY; case S3: /* e.g. "_" */ - lx_like_ungetc(lx, c); return TOK_ANY; + lx_like_ungetc(lx, c); lx_like_dynpop(lx->buf_opaque); return TOK_ANY; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_MANY; + case S3: return TOK_ANY; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_like_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_MANY; - case S3: return TOK_ANY; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -254,6 +289,7 @@ lx_like_init(struct lx_like_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_like_dynpop; } enum lx_like_token diff --git a/src/libre/dialect/like/parser.c b/src/libre/dialect/like/parser.c index 64bfe1078..008bd2b04 100644 --- a/src/libre/dialect/like/parser.c +++ b/src/libre/dialect/like/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -304,7 +304,7 @@ ZL2_list_Hof_Hatoms:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -322,7 +322,7 @@ ZL2_list_Hof_Hatoms:; goto ZL2_list_Hof_Hatoms; /* END OF INLINE: 
list-of-atoms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -348,7 +348,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? &class_any : &class_notnl; @@ -358,7 +358,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -378,7 +378,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -397,7 +397,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIe) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZIe) == NULL) { @@ -418,7 +418,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -428,7 +428,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIg) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIg) == NULL) { @@ -440,7 +440,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -449,7 +449,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, /* END OF ACTION: count-zero-or-more */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); @@ -478,7 +478,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -490,7 +490,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -523,7 +523,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -537,7 +537,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZIe) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -558,7 
+558,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -575,7 +575,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* END OF INLINE: 119 */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -601,7 +601,7 @@ p_re__like(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -626,7 +626,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/like/parser.h b/src/libre/dialect/like/parser.h index f6c87ad7b..5294f9792 100644 --- a/src/libre/dialect/like/parser.h +++ b/src/libre/dialect/like/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__like(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/like/parser.h" diff --git a/src/libre/dialect/literal/lexer.c b/src/libre/dialect/literal/lexer.c index f4ff77a37..f13bdbc6f 100644 --- a/src/libre/dialect/literal/lexer.c +++ b/src/libre/dialect/literal/lexer.c @@ -10,11 +10,31 @@ static enum lx_literal_token z0(struct lx_literal_lx *lx); +static int +lx_literal_advance_end(struct lx_literal_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = 
lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_literal_lx *lx) +lx_literal_getc(struct lx_literal_lx *lx) { int c; @@ -30,18 +50,19 @@ lx_getc(struct lx_literal_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_literal_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_literal_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_literal_getc((struct lx_literal_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -50,10 +71,7 @@ lx_literal_ungetc(struct lx_literal_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -105,6 +123,17 @@ lx_literal_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_literal_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_literal_dynclear(void *buf_opaque) { @@ -144,52 +173,58 @@ lx_literal_dynfree(void *buf_opaque) static enum lx_literal_token z0(struct lx_literal_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ state = S1; break; case S1: /* e.g. 
"" */ - lx_literal_ungetc(lx, c); return TOK_CHAR; + lx_literal_ungetc(lx, c); lx_literal_dynpop(lx->buf_opaque); return TOK_CHAR; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_literal_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -237,6 +272,7 @@ lx_literal_init(struct lx_literal_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_literal_dynpop; } enum lx_literal_token diff --git a/src/libre/dialect/literal/parser.c b/src/libre/dialect/literal/parser.c index 44547716b..5d1dc82f7 100644 --- a/src/libre/dialect/literal/parser.c +++ b/src/libre/dialect/literal/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -304,7 +304,7 @@ ZL2_list_Hof_Hatoms:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -322,7 +322,7 @@ ZL2_list_Hof_Hatoms:; goto ZL2_list_Hof_Hatoms; /* END OF INLINE: list-of-atoms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -349,7 +349,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -363,7 +363,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" 
(ZIe) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -384,7 +384,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -401,7 +401,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ /* END OF INLINE: 117 */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -427,7 +427,7 @@ p_re__literal(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -467,7 +467,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, case (TOK_CHAR): /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -490,7 +490,7 @@ p_list_Hof_Hatoms_C_Catom(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -506,7 +506,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -518,7 +518,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, 
*flags); if ((ZInode) == NULL) { @@ -539,7 +539,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/literal/parser.h b/src/libre/dialect/literal/parser.h index be58db4ea..7f90a15ef 100644 --- a/src/libre/dialect/literal/parser.h +++ b/src/libre/dialect/literal/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__literal(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/literal/parser.h" diff --git a/src/libre/dialect/native/lexer.c b/src/libre/dialect/native/lexer.c index 2399683ac..b18634004 100644 --- a/src/libre/dialect/native/lexer.c +++ b/src/libre/dialect/native/lexer.c @@ -12,11 +12,31 @@ static enum lx_native_token z0(struct lx_native_lx *lx); static enum lx_native_token z1(struct lx_native_lx *lx); static enum lx_native_token z2(struct lx_native_lx *lx); +static int +lx_native_advance_end(struct lx_native_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. 
*/ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_native_lx *lx) +lx_native_getc(struct lx_native_lx *lx) { int c; @@ -32,18 +52,19 @@ lx_getc(struct lx_native_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_native_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_native_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_native_getc((struct lx_native_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -52,10 +73,7 @@ lx_native_ungetc(struct lx_native_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -107,6 +125,17 @@ lx_native_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_native_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_native_dynclear(void *buf_opaque) { @@ -146,29 +175,28 @@ lx_native_dynfree(void *buf_opaque) static enum lx_native_token z0(struct lx_native_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '0': case '1': @@ -182,7 +210,9 @@ z0(struct lx_native_lx *lx) case '9': state = S1; break; case ',': state = S2; break; case '}': state = S3; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -198,69 +228,75 @@ z0(struct lx_native_lx *lx) case '7': case '8': case '9': break; - default: lx_native_ungetc(lx, c); return TOK_COUNT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_COUNT; } break; case S2: /* e.g. "," */ - lx_native_ungetc(lx, c); return TOK_SEP; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_SEP; case S3: /* e.g. "}" */ - lx_native_ungetc(lx, c); return lx->z = z2, TOK_CLOSECOUNT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSECOUNT; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_COUNT; + case S2: return TOK_SEP; + case S3: return lx->z = z2, TOK_CLOSECOUNT; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_COUNT; - case S2: return TOK_SEP; - case S3: return TOK_CLOSECOUNT; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_native_token z1(struct lx_native_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, - S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, - S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, - S50, S51, S52, S53, S54, S55, 
S56, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, + S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, + S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, + S50, S51, S52, S53, S54, S55, S56 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '[': state = S1; break; case '\\': state = S3; break; @@ -273,12 +309,12 @@ z1(struct lx_native_lx *lx) case S1: /* e.g. "[" */ switch ((unsigned char) c) { case ':': state = S12; break; - default: lx_native_ungetc(lx, c); return TOK_CHAR; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; } break; case S2: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_CHAR; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\\" */ switch ((unsigned char) c) { @@ -301,25 +337,27 @@ z1(struct lx_native_lx *lx) case '5': case '6': case '7': state = S9; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S4: /* e.g. "-" */ switch ((unsigned char) c) { case ']': state = S6; break; - default: lx_native_ungetc(lx, c); return TOK_RANGE; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_RANGE; } break; case S5: /* e.g. "]" */ - lx_native_ungetc(lx, c); return lx->z = z2, TOK_CLOSEGROUP; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSEGROUP; case S6: /* e.g. 
"-]" */ - lx_native_ungetc(lx, c); return lx->z = z2, TOK_CLOSEGROUPRANGE; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSEGROUPRANGE; case S7: /* e.g. "\\-" */ - lx_native_ungetc(lx, c); return TOK_ESC; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ESC; case S8: /* e.g. "\\x" */ switch ((unsigned char) c) { @@ -345,7 +383,9 @@ z1(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S10; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -359,7 +399,7 @@ z1(struct lx_native_lx *lx) case '5': case '6': case '7': break; - default: lx_native_ungetc(lx, c); return TOK_OCT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; } break; @@ -387,12 +427,14 @@ z1(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S11; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S11: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_HEX; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_HEX; case S12: /* e.g. "[:" */ switch ((unsigned char) c) { @@ -408,35 +450,45 @@ z1(struct lx_native_lx *lx) case 'w': state = S21; break; case 'u': state = S22; break; case 'p': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S13: /* e.g. "[:d" */ switch ((unsigned char) c) { case 'i': state = S55; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S14: /* e.g. 
"[:s" */ switch ((unsigned char) c) { case 'p': state = S52; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S15: /* e.g. "[:h" */ switch ((unsigned char) c) { case 's': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S16: /* e.g. "[:g" */ switch ((unsigned char) c) { case 'r': state = S49; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -444,42 +496,54 @@ z1(struct lx_native_lx *lx) switch ((unsigned char) c) { case 's': state = S41; break; case 'l': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S18: /* e.g. "[:c" */ switch ((unsigned char) c) { case 'n': state = S38; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S19: /* e.g. "[:l" */ switch ((unsigned char) c) { case 'o': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S20: /* e.g. "[:x" */ switch ((unsigned char) c) { case 'd': state = S13; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S21: /* e.g. 
"[:w" */ switch ((unsigned char) c) { case 'o': state = S35; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S22: /* e.g. "[:u" */ switch ((unsigned char) c) { case 'p': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -487,129 +551,165 @@ z1(struct lx_native_lx *lx) switch ((unsigned char) c) { case 'r': state = S24; break; case 'u': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S24: /* e.g. "[:pr" */ switch ((unsigned char) c) { case 'i': state = S31; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S25: /* e.g. "[:pu" */ switch ((unsigned char) c) { case 'n': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S26: /* e.g. "[:pun" */ switch ((unsigned char) c) { case 'c': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S27: /* e.g. "[:digi" */ switch ((unsigned char) c) { case 't': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S28: /* e.g. 
"[:word" */ switch ((unsigned char) c) { case ':': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S29: /* e.g. "[:word:" */ switch ((unsigned char) c) { case ']': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S30: /* e.g. "[:word:]" */ - lx_native_ungetc(lx, c); return TOK_NAMED__CLASS; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; case S31: /* e.g. "[:pri" */ switch ((unsigned char) c) { case 'n': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S32: /* e.g. "[:up" */ switch ((unsigned char) c) { case 'p': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S33: /* e.g. "[:low" */ switch ((unsigned char) c) { case 'e': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S34: /* e.g. "[:lowe" */ switch ((unsigned char) c) { case 'r': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S35: /* e.g. 
"[:wo" */ switch ((unsigned char) c) { case 'r': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S36: /* e.g. "[:wor" */ switch ((unsigned char) c) { case 'd': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S37: /* e.g. "[:lo" */ switch ((unsigned char) c) { case 'w': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S38: /* e.g. "[:cn" */ switch ((unsigned char) c) { case 't': state = S39; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S39: /* e.g. "[:cnt" */ switch ((unsigned char) c) { case 'r': state = S40; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S40: /* e.g. "[:cntr" */ switch ((unsigned char) c) { case 'l': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S41: /* e.g. 
"[:as" */ switch ((unsigned char) c) { case 'c': state = S47; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -617,164 +717,200 @@ z1(struct lx_native_lx *lx) switch ((unsigned char) c) { case 'p': state = S43; break; case 'n': state = S44; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S43: /* e.g. "[:alp" */ switch ((unsigned char) c) { case 'h': state = S46; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S44: /* e.g. "[:aln" */ switch ((unsigned char) c) { case 'u': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S45: /* e.g. "[:alnu" */ switch ((unsigned char) c) { case 'm': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S46: /* e.g. "[:alph" */ switch ((unsigned char) c) { case 'a': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S47: /* e.g. "[:asc" */ switch ((unsigned char) c) { case 'i': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S48: /* e.g. 
"[:asci" */ switch ((unsigned char) c) { case 'i': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S49: /* e.g. "[:gr" */ switch ((unsigned char) c) { case 'a': state = S50; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S50: /* e.g. "[:gra" */ switch ((unsigned char) c) { case 'p': state = S51; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S51: /* e.g. "[:grap" */ switch ((unsigned char) c) { case 'h': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S52: /* e.g. "[:sp" */ switch ((unsigned char) c) { case 'a': state = S53; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S53: /* e.g. "[:spa" */ switch ((unsigned char) c) { case 'c': state = S54; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S54: /* e.g. "[:spac" */ switch ((unsigned char) c) { case 'e': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S55: /* e.g. 
"[:di" */ switch ((unsigned char) c) { case 'g': state = S56; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S56: /* e.g. "[:dig" */ switch ((unsigned char) c) { case 'i': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; case S4: return TOK_RANGE; - case S5: return TOK_CLOSEGROUP; - case S6: return TOK_CLOSEGROUPRANGE; + case S5: return lx->z = z2, TOK_CLOSEGROUP; + case S6: return lx->z = z2, TOK_CLOSEGROUPRANGE; case S7: return TOK_ESC; case S9: return TOK_OCT; case S11: return TOK_HEX; case S30: return TOK_NAMED__CLASS; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_native_token z2(struct lx_native_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, 
S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; case '{': state = S3; break; @@ -820,63 +956,63 @@ z2(struct lx_native_lx *lx) case 'v': case '{': case '|': state = S19; break; - default: lx_native_ungetc(lx, c); return TOK_CHAR; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; } break; case S2: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_CHAR; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "{" */ - lx_native_ungetc(lx, c); return lx->z = z0, TOK_OPENCOUNT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z0, TOK_OPENCOUNT; case S4: /* e.g. "[" */ switch ((unsigned char) c) { case '^': state = S14; break; case ']': state = S15; break; - default: lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUP; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUP; } break; case S5: /* e.g. "|" */ - lx_native_ungetc(lx, c); return TOK_ALT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ALT; case S6: /* e.g. "." */ - lx_native_ungetc(lx, c); return TOK_ANY; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ANY; case S7: /* e.g. "+" */ - lx_native_ungetc(lx, c); return TOK_PLUS; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_PLUS; case S8: /* e.g. "*" */ - lx_native_ungetc(lx, c); return TOK_STAR; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_STAR; - case S9: /* e.g. "?" */ - lx_native_ungetc(lx, c); return TOK_OPT; + case S9: /* e.g. "\077" */ + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OPT; case S10: /* e.g. 
"$" */ - lx_native_ungetc(lx, c); return TOK_END; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_END; case S11: /* e.g. "^" */ - lx_native_ungetc(lx, c); return TOK_START; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_START; case S12: /* e.g. ")" */ - lx_native_ungetc(lx, c); return TOK_CLOSESUB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_CLOSESUB; case S13: /* e.g. "(" */ - lx_native_ungetc(lx, c); return TOK_OPENSUB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OPENSUB; case S14: /* e.g. "[^" */ switch ((unsigned char) c) { case ']': state = S16; break; - default: lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUPINV; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUPINV; } break; case S15: /* e.g. "[]" */ - lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUPCB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUPCB; case S16: /* e.g. "[^]" */ - lx_native_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUPINVCB; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUPINVCB; case S17: /* e.g. "\\x" */ switch ((unsigned char) c) { @@ -902,7 +1038,9 @@ z2(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -916,12 +1054,12 @@ z2(struct lx_native_lx *lx) case '5': case '6': case '7': state = S20; break; - default: lx_native_ungetc(lx, c); return TOK_OCT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; } break; case S19: /* e.g. "\\$" */ - lx_native_ungetc(lx, c); return TOK_ESC; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_ESC; case S20: /* e.g. 
"\\00" */ switch ((unsigned char) c) { @@ -933,12 +1071,12 @@ z2(struct lx_native_lx *lx) case '5': case '6': case '7': state = S21; break; - default: lx_native_ungetc(lx, c); return TOK_OCT; + default: lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; } break; case S21: /* e.g. "\\000" */ - lx_native_ungetc(lx, c); return TOK_OCT; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_OCT; case S22: /* e.g. "\\x0" */ switch ((unsigned char) c) { @@ -964,32 +1102,26 @@ z2(struct lx_native_lx *lx) case 'd': case 'e': case 'f': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S23: /* e.g. "\\x00" */ - lx_native_ungetc(lx, c); return TOK_HEX; + lx_native_ungetc(lx, c); lx_native_dynpop(lx->buf_opaque); return TOK_HEX; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; - case S3: return TOK_OPENCOUNT; - case S4: return TOK_OPENGROUP; + case S3: return lx->z = z0, TOK_OPENCOUNT; + case S4: return lx->z = z1, TOK_OPENGROUP; case S5: return TOK_ALT; case S6: return TOK_ANY; case S7: return TOK_PLUS; @@ -999,16 +1131,31 @@ z2(struct lx_native_lx *lx) case S11: return TOK_START; case S12: return TOK_CLOSESUB; case S13: return TOK_OPENSUB; - case S14: return TOK_OPENGROUPINV; - case S15: return TOK_OPENGROUPCB; - case S16: return TOK_OPENGROUPINVCB; + case S14: return lx->z = z1, TOK_OPENGROUPINV; + case S15: return lx->z = z1, TOK_OPENGROUPCB; + case S16: return lx->z = z1, TOK_OPENGROUPINVCB; case S18: return TOK_OCT; case S19: return TOK_ESC; case S20: return TOK_OCT; case S21: return TOK_OCT; case S23: return TOK_HEX; - default: errno = EINVAL; return 
TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_native_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -1164,6 +1311,7 @@ lx_native_init(struct lx_native_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_native_dynpop; } enum lx_native_token diff --git a/src/libre/dialect/native/parser.c b/src/libre/dialect/native/parser.c index 809383bf8..63c5f8cb8 100644 --- a/src/libre/dialect/native/parser.c +++ b/src/libre/dialect/native/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -326,7 +326,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -349,7 +349,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -385,7 +385,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -443,7 +443,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -499,7 +499,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags /* END OF INLINE: 141 */ /* BEGINNING OF 
ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -531,7 +531,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI216 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -545,7 +545,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI263)); mark(&act_state->countend, &(ZIend)); @@ -555,7 +555,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((*ZIm) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -586,7 +586,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -618,7 +618,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_CLOSECOUNT): /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI219 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -636,7 +636,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI263)); mark(&act_state->countend, &(ZIend)); @@ -646,7 +646,7 @@ p_265(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF 
ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -696,7 +696,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIclass), (ZInode))) { goto ZL1; @@ -709,7 +709,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; goto ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms; /* END OF INLINE: expr::character-class::list-of-class-terms */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -736,7 +736,7 @@ p_154(flags flags, lex_state lex_state, act_state act_state, err err) case (TOK_RANGE): /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI155 = '-'; ZI156 = lex_state->lx.start; @@ -760,7 +760,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-range */ { -#line 722 "src/libre/parser.act" +#line 718 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXRANGE; @@ -795,7 +795,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -815,7 +815,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; goto ZL2_expr_C_Clist_Hof_Hpieces; /* END OF INLINE: expr::list-of-pieces */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -849,7 +849,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -875,7 +875,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" 
+#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -914,7 +914,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -975,7 +975,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1031,7 +1031,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* END OF INLINE: 109 */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1064,7 +1064,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1096,7 +1096,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1141,7 +1141,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1208,7 +1208,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZI243 = DIALECT_CLASS(lex_state->buf.a); if (ZI243 == NULL) { @@ -1241,7 +1241,7 @@ 
p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1325,7 +1325,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -1349,7 +1349,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_NAMED; (ZIr).u.named.class = (ZIid); @@ -1389,7 +1389,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUP */ { -#line 319 "src/libre/parser.act" +#line 318 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI167 = lex_state->lx.end; @@ -1403,7 +1403,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1430,7 +1430,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPCB */ { -#line 335 "src/libre/parser.act" +#line 334 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI186 = lex_state->lx.end; @@ -1444,7 +1444,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1457,7 +1457,7 @@ 
p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: make-literal-cbrak */ { -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -1471,7 +1471,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -1493,7 +1493,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINV */ { -#line 327 "src/libre/parser.act" +#line 326 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI178 = lex_state->lx.end; @@ -1507,7 +1507,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1520,7 +1520,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1577,7 +1577,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINVCB */ { -#line 343 "src/libre/parser.act" +#line 342 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI193 = lex_state->lx.end; @@ -1591,7 +1591,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1604,7 +1604,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, 
act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1647,7 +1647,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-make-invert */ /* BEGINNING OF ACTION: make-literal-cbrak */ { -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -1661,7 +1661,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -1693,7 +1693,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI200 = ']'; ZI201 = lex_state->lx.start; @@ -1709,7 +1709,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -1728,7 +1728,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUPRANGE */ { -#line 361 "src/libre/parser.act" +#line 360 "src/libre/parser.act" ZIcrange = '-'; ZI203 = lex_state->lx.start; @@ -1744,7 +1744,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIrange) = ast_make_expr_literal(act_state->poolp, *flags, (ZIcrange)); if ((ZIrange) == NULL) { @@ -1756,7 +1756,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: 
ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZIrange))) { goto ZL4; @@ -1767,7 +1767,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -1785,7 +1785,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -1821,7 +1821,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZIrstart = lex_state->lx.start; @@ -1842,7 +1842,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode1) == NULL) { @@ -1865,7 +1865,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (ZIc); @@ -1875,7 +1875,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI183 = '-'; ZI184 = lex_state->lx.start; @@ -1896,7 +1896,7 @@ p_180(flags 
flags, lex_state lex_state, act_state act_state, err err, t_ast__exp } /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -1935,7 +1935,7 @@ p_180(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF INLINE: 182 */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZItmp), (ZInode1))) { goto ZL1; @@ -1977,7 +1977,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI149 = lex_state->lx.start; @@ -1993,7 +1993,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -2050,7 +2050,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, } /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -2088,7 +2088,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2109,7 +2109,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2129,7 
+2129,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -2141,7 +2141,7 @@ ZL1:; /* END OF ACTION: err-expected-alts */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2170,7 +2170,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (*ZIcbrak)); if ((ZInode1) == NULL) { @@ -2194,7 +2194,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (*ZIcbrak); @@ -2204,7 +2204,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI196 = '-'; ZI197 = lex_state->lx.start; @@ -2225,7 +2225,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs } /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZIcbrak); @@ -2235,7 +2235,7 @@ p_195(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2294,7 +2294,7 
@@ p_re__native(flags flags, lex_state lex_state, act_state act_state, err err, t_a /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -2308,7 +2308,7 @@ p_re__native(flags flags, lex_state lex_state, act_state act_state, err err, t_a } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -2334,7 +2334,7 @@ p_re__native(flags flags, lex_state lex_state, act_state act_state, err err, t_a { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -2422,7 +2422,7 @@ ZL2_expr_C_Clist_Hof_Halts:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIalts), (ZIa))) { goto ZL1; @@ -2441,7 +2441,7 @@ ZL2_expr_C_Clist_Hof_Halts:; goto ZL2_expr_C_Clist_Hof_Halts; /* END OF INLINE: expr::list-of-alts */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -2453,7 +2453,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -2485,7 +2485,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: OPENCOUNT */ { -#line 371 "src/libre/parser.act" +#line 370 "src/libre/parser.act" ZI263 = lex_state->lx.start; ZI264 = lex_state->lx.end; @@ -2501,7 +2501,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -2541,7 +2541,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, 
act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-one */ { -#line 817 "src/libre/parser.act" +#line 816 "src/libre/parser.act" (ZIc) = ast_make_count(0, 1); @@ -2555,7 +2555,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-one-or-more */ { -#line 813 "src/libre/parser.act" +#line 812 "src/libre/parser.act" (ZIc) = ast_make_count(1, AST_COUNT_UNBOUNDED); @@ -2569,7 +2569,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -2582,7 +2582,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, { /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -2599,7 +2599,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-count */ { -#line 701 "src/libre/parser.act" +#line 697 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCOUNT; @@ -2611,7 +2611,7 @@ ZL1:; /* END OF ACTION: err-expected-count */ /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -2640,7 +2640,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -2650,7 +2650,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -2667,7 +2667,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-end */ { -#line 943 "src/libre/parser.act" +#line 942 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_END); if ((ZIe) == NULL) { @@ -2687,7 +2687,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -2701,7 +2701,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZIe) = ast_make_expr_group(act_state->poolp, *flags, (ZIg), (ZIid)); if ((ZIe) == NULL) { @@ -2725,7 +2725,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-start */ { -#line 936 "src/libre/parser.act" +#line 935 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_START); if ((ZIe) == NULL) { @@ -2765,7 +2765,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -2777,7 +2777,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = 
ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -2806,7 +2806,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla { /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (*ZI243)); if ((ZInode) == NULL) { @@ -2826,7 +2826,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_NAMED; (ZIlower).u.named.class = (*ZI243); @@ -2842,7 +2842,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla } /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI244)); mark(&act_state->rangeend, &(ZIend)); @@ -2852,7 +2852,7 @@ p_246(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2908,7 +2908,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZInode) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2942,7 +2942,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (*ZI247)); if ((ZInode) == NULL) { @@ -2962,7 +2962,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* BEGINNING OF ACTION: 
ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZI247); @@ -2978,7 +2978,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI } /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI248)); mark(&act_state->rangeend, &(ZIend)); @@ -2988,7 +2988,7 @@ p_250(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -3035,7 +3035,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/native/parser.h b/src/libre/dialect/native/parser.h index e19648892..5cf04f6c6 100644 --- a/src/libre/dialect/native/parser.h +++ b/src/libre/dialect/native/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__native(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/native/parser.h" diff --git a/src/libre/dialect/pcre/lexer.c b/src/libre/dialect/pcre/lexer.c index b26096785..da8f825ee 100644 --- a/src/libre/dialect/pcre/lexer.c +++ b/src/libre/dialect/pcre/lexer.c @@ -17,11 +17,31 @@ static enum lx_pcre_token z5(struct lx_pcre_lx *lx); static enum lx_pcre_token z6(struct lx_pcre_lx *lx); static enum lx_pcre_token z7(struct lx_pcre_lx *lx); +static int +lx_pcre_advance_end(struct lx_pcre_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col 
= 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_pcre_lx *lx) +lx_pcre_getc(struct lx_pcre_lx *lx) { int c; @@ -37,18 +57,19 @@ lx_getc(struct lx_pcre_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_pcre_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_pcre_getc to the interface + * in libfsm's generated code. */ +static int +fsm_getc(void *getc_opaque) +{ + return lx_pcre_getc((struct lx_pcre_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -57,10 +78,7 @@ lx_pcre_ungetc(struct lx_pcre_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -112,6 +130,17 @@ lx_pcre_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_pcre_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_pcre_dynclear(void *buf_opaque) { @@ -151,32 +180,33 @@ lx_pcre_dynfree(void *buf_opaque) static enum lx_pcre_token z0(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { case '\\': state = S1; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S2; break; } break; @@ -184,19 +214,30 @@ z0(struct lx_pcre_lx *lx) case S1: /* e.g. "\\" */ switch ((unsigned char) c) { case 'E': state = S3; break; - default: lx_pcre_ungetc(lx, c); return TOK_CHAR; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S2: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S2: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_CHAR; + case S3: return lx->z = z7, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S3: @@ -211,45 +252,40 @@ z0(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_CHAR; - case S3: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z1(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c 
!= EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '0': case '1': @@ -263,7 +299,9 @@ z1(struct lx_pcre_lx *lx) case '9': state = S1; break; case ',': state = S2; break; case '}': state = S3; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -279,67 +317,75 @@ z1(struct lx_pcre_lx *lx) case '7': case '8': case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_COUNT; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_COUNT; } break; case S2: /* e.g. "," */ - lx_pcre_ungetc(lx, c); return TOK_SEP; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_SEP; case S3: /* e.g. "}" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSECOUNT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSECOUNT; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_COUNT; + case S2: return TOK_SEP; + case S3: return lx->z = z7, TOK_CLOSECOUNT; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_COUNT; - case S2: return TOK_SEP; - case S3: return TOK_CLOSECOUNT; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z2(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state 
= S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S2; break; } break; @@ -347,19 +393,30 @@ z2(struct lx_pcre_lx *lx) case S1: /* e.g. "\\" */ switch ((unsigned char) c) { case 'E': state = S3; break; - default: lx_pcre_ungetc(lx, c); return TOK_CHAR; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S2: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S2: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z = z3, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_CHAR; + case S3: return lx->z = z3, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S3: @@ -374,24 +431,30 @@ z2(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_CHAR; - case S3: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z3(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; + assert(lx != NULL); + + if (lx->clear != NULL) { + lx->clear(lx->buf_opaque); + } + + lx->start = lx->end; + + void *getc_opaque 
= (void *)lx; enum { S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, @@ -400,32 +463,23 @@ z3(struct lx_pcre_lx *lx) S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63, S64, S65, S66, S67, S68, S69, - S70, S71, S72, NONE + S70, S71, S72 } state; - assert(lx != NULL); - - if (lx->clear != NULL) { - lx->clear(lx->buf_opaque); - } - - state = NONE; - - lx->start = lx->end; - - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S1; break; case '[': state = S2; break; case '-': state = S4; break; case ']': state = S5; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S3; break; } break; @@ -441,22 +495,8 @@ z3(struct lx_pcre_lx *lx) case 'h': case 's': case 'v': - case 'w': state = S24; break; - case 'Q': state = S55; break; - case 'E': state = S56; break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': state = S57; break; - case 'x': state = S58; break; - case '0': state = S59; break; - case 'o': state = S60; break; - case 'c': state = S61; break; + case 'w': state = S23; break; + case 'c': state = S55; break; case '$': case '(': case '*': @@ -475,384 +515,492 @@ z3(struct lx_pcre_lx *lx) case 'r': case 't': case '{': - case '|': state = S63; break; - default: state = S62; break; + case '|': state = S56; break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': state = S57; break; + case 'Q': state = S58; break; + case 'E': state = S59; break; + case 'o': state = S60; break; 
+ case 'x': state = S62; break; + case '0': state = S63; break; + default: state = S61; break; } break; case S2: /* e.g. "[" */ switch ((unsigned char) c) { case ':': state = S7; break; - default: lx_pcre_ungetc(lx, c); return TOK_CHAR; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S3: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S3: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S4: /* e.g. "-" */ switch ((unsigned char) c) { case ']': state = S6; break; - default: lx_pcre_ungetc(lx, c); return TOK_RANGE; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_RANGE; } break; case S5: /* e.g. "]" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSEGROUP; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSEGROUP; case S6: /* e.g. "-]" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSEGROUPRANGE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSEGROUPRANGE; case S7: /* e.g. "[:" */ switch ((unsigned char) c) { case 'd': state = S8; break; - case 'u': state = S9; break; - case 'w': state = S10; break; - case 'x': state = S11; break; - case 'b': state = S12; break; - case 'c': state = S13; break; - case 'l': state = S14; break; + case 'p': state = S9; break; + case 'x': state = S10; break; + case 'c': state = S11; break; + case 'l': state = S12; break; + case 'g': state = S13; break; + case 's': state = S14; break; case 'a': state = S15; break; - case 's': state = S16; break; - case 'p': state = S17; break; - case 'g': state = S18; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'b': state = S16; break; + case 'u': state = S17; break; + case 'w': state = S18; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S8: /* e.g. 
"[:d" */ switch ((unsigned char) c) { case 'i': state = S53; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S9: /* e.g. "[:u" */ + case S9: /* e.g. "[:p" */ switch ((unsigned char) c) { - case 'p': state = S52; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S48; break; + case 'u': state = S49; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S10: /* e.g. "[:w" */ + case S10: /* e.g. "[:x" */ switch ((unsigned char) c) { - case 'o': state = S50; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'd': state = S8; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S11: /* e.g. "[:x" */ + case S11: /* e.g. "[:c" */ switch ((unsigned char) c) { - case 'd': state = S8; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S45; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S12: /* e.g. "[:b" */ + case S12: /* e.g. "[:l" */ switch ((unsigned char) c) { - case 'l': state = S47; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'o': state = S44; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S13: /* e.g. "[:c" */ + case S13: /* e.g. "[:g" */ switch ((unsigned char) c) { - case 'n': state = S44; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S41; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S14: /* e.g. "[:l" */ + case S14: /* e.g. 
"[:s" */ switch ((unsigned char) c) { - case 'o': state = S41; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S38; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S15: /* e.g. "[:a" */ switch ((unsigned char) c) { - case 's': state = S33; break; - case 'l': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 's': state = S30; break; + case 'l': state = S31; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S16: /* e.g. "[:s" */ + case S16: /* e.g. "[:b" */ switch ((unsigned char) c) { - case 'p': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'l': state = S27; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S17: /* e.g. "[:p" */ + case S17: /* e.g. "[:u" */ switch ((unsigned char) c) { - case 'r': state = S25; break; - case 'u': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S24; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S18: /* e.g. "[:g" */ + case S18: /* e.g. "[:w" */ switch ((unsigned char) c) { - case 'r': state = S19; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'o': state = S19; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S19: /* e.g. "[:gr" */ + case S19: /* e.g. 
"[:wo" */ switch ((unsigned char) c) { - case 'a': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S20; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S20: /* e.g. "[:gra" */ + case S20: /* e.g. "[:wor" */ switch ((unsigned char) c) { - case 'p': state = S21; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'd': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S21: /* e.g. "[:grap" */ + case S21: /* e.g. "[:word" */ switch ((unsigned char) c) { - case 'h': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case ':': state = S22; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S22: /* e.g. "[:word" */ + case S22: /* e.g. "[:word:" */ switch ((unsigned char) c) { - case ':': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case ']': state = S23; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S23: /* e.g. "[:word:" */ + case S23: /* e.g. "\\D" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; + + case S24: /* e.g. "[:up" */ switch ((unsigned char) c) { - case ']': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S25; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S24: /* e.g. "\\d" */ - lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; - - case S25: /* e.g. "[:pr" */ + case S25: /* e.g. 
"[:low" */ switch ((unsigned char) c) { - case 'i': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'e': state = S26; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S26: /* e.g. "[:pu" */ + case S26: /* e.g. "[:lowe" */ switch ((unsigned char) c) { - case 'n': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S27: /* e.g. "[:pun" */ + case S27: /* e.g. "[:bl" */ switch ((unsigned char) c) { - case 'c': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S28; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S28: /* e.g. "[:digi" */ + case S28: /* e.g. "[:bla" */ switch ((unsigned char) c) { - case 't': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S29; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S29: /* e.g. "[:pri" */ + case S29: /* e.g. "[:blan" */ switch ((unsigned char) c) { - case 'n': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'k': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S30: /* e.g. "[:sp" */ + case S30: /* e.g. "[:as" */ switch ((unsigned char) c) { - case 'a': state = S31; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'c': state = S36; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S31: /* e.g. 
"[:spa" */ + case S31: /* e.g. "[:al" */ switch ((unsigned char) c) { - case 'c': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S32; break; + case 'n': state = S33; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S32: /* e.g. "[:spac" */ + case S32: /* e.g. "[:alp" */ switch ((unsigned char) c) { - case 'e': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'h': state = S35; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S33: /* e.g. "[:as" */ + case S33: /* e.g. "[:aln" */ switch ((unsigned char) c) { - case 'c': state = S39; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'u': state = S34; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S34: /* e.g. "[:al" */ + case S34: /* e.g. "[:alnu" */ switch ((unsigned char) c) { - case 'n': state = S35; break; - case 'p': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'm': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S35: /* e.g. "[:aln" */ + case S35: /* e.g. "[:alph" */ switch ((unsigned char) c) { - case 'u': state = S38; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S36: /* e.g. "[:alp" */ + case S36: /* e.g. 
"[:asc" */ switch ((unsigned char) c) { - case 'h': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S37; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S37: /* e.g. "[:alph" */ + case S37: /* e.g. "[:asci" */ switch ((unsigned char) c) { - case 'a': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S38: /* e.g. "[:alnu" */ + case S38: /* e.g. "[:sp" */ switch ((unsigned char) c) { - case 'm': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S39; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S39: /* e.g. "[:asc" */ + case S39: /* e.g. "[:spa" */ switch ((unsigned char) c) { - case 'i': state = S40; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'c': state = S40; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S40: /* e.g. "[:asci" */ + case S40: /* e.g. "[:spac" */ switch ((unsigned char) c) { - case 'i': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'e': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S41: /* e.g. "[:lo" */ + case S41: /* e.g. "[:gr" */ switch ((unsigned char) c) { - case 'w': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'a': state = S42; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S42: /* e.g. 
"[:low" */ + case S42: /* e.g. "[:gra" */ switch ((unsigned char) c) { - case 'e': state = S43; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'p': state = S43; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S43: /* e.g. "[:lowe" */ + case S43: /* e.g. "[:grap" */ switch ((unsigned char) c) { - case 'r': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'h': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S44: /* e.g. "[:cn" */ + case S44: /* e.g. "[:lo" */ switch ((unsigned char) c) { - case 't': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'w': state = S25; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S45: /* e.g. "[:cnt" */ + case S45: /* e.g. "[:cn" */ switch ((unsigned char) c) { - case 'r': state = S46; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 't': state = S46; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S46: /* e.g. "[:cntr" */ + case S46: /* e.g. "[:cnt" */ switch ((unsigned char) c) { - case 'l': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'r': state = S47; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S47: /* e.g. "[:bl" */ + case S47: /* e.g. 
"[:cntr" */ switch ((unsigned char) c) { - case 'a': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'l': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S48: /* e.g. "[:bla" */ + case S48: /* e.g. "[:pr" */ switch ((unsigned char) c) { - case 'n': state = S49; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S52; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S49: /* e.g. "[:blan" */ + case S49: /* e.g. "[:pu" */ switch ((unsigned char) c) { - case 'k': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S50; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S50: /* e.g. "[:wo" */ + case S50: /* e.g. "[:pun" */ switch ((unsigned char) c) { - case 'r': state = S51; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'c': state = S51; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S51: /* e.g. "[:wor" */ + case S51: /* e.g. "[:digi" */ switch ((unsigned char) c) { - case 'd': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 't': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S52: /* e.g. "[:up" */ + case S52: /* e.g. "[:pri" */ switch ((unsigned char) c) { - case 'p': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'n': state = S51; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S53: /* e.g. 
"[:di" */ switch ((unsigned char) c) { case 'g': state = S54; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S54: /* e.g. "[:dig" */ switch ((unsigned char) c) { - case 'i': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'i': state = S51; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S55: /* e.g. "\\Q" */ - lx_pcre_ungetc(lx, c); return lx->z = z2, lx->z(lx); + case S55: /* e.g. "\\c" */ + state = S72; break; - case S56: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z(lx); + case S56: /* e.g. "\\$" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ESC; case S57: /* e.g. "\\1" */ switch ((unsigned char) c) { @@ -866,13 +1014,29 @@ z3(struct lx_pcre_lx *lx) case '7': case '8': case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; - case S58: /* e.g. "\\x" */ + case S58: /* e.g. "\\Q" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z2, lx->z(lx); + + case S59: /* e.g. "\\E" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z(lx); + + case S60: /* e.g. "\\o" */ switch ((unsigned char) c) { - case '{': state = S69; break; + case '{': state = S70; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + } + break; + + case S61: /* e.g. "\\\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + + case S62: /* e.g. 
"\\x" */ + switch ((unsigned char) c) { + case '{': state = S66; break; case '0': case '1': case '2': @@ -894,12 +1058,12 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S70; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + case 'f': state = S67; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S59: /* e.g. "\\0" */ + case S63: /* e.g. "\\0" */ switch ((unsigned char) c) { case '0': case '1': @@ -908,31 +1072,12 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S68; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; + case '7': state = S64; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S60: /* e.g. "\\o" */ - switch ((unsigned char) c) { - case '{': state = S65; break; - default: lx_pcre_ungetc(lx, c); return TOK_NOESC; - } - break; - - case S61: /* e.g. "\\c" */ - state = S64; break; - - case S62: /* e.g. "\\g" */ - lx_pcre_ungetc(lx, c); return TOK_NOESC; - - case S63: /* e.g. "\\a" */ - lx_pcre_ungetc(lx, c); return TOK_ESC; - - case S64: /* e.g. "\\ca" */ - lx_pcre_ungetc(lx, c); return TOK_CONTROL; - - case S65: /* e.g. "\\o{" */ + case S64: /* e.g. "\\00" */ switch ((unsigned char) c) { case '0': case '1': @@ -941,44 +1086,15 @@ z3(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S66; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case '7': state = S65; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S66: /* e.g. "\\o{0" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': break; - case '}': state = S67; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; - } - break; + case S65: /* e.g. "\\000" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; - case S67: /* e.g. 
"\\000" */ - lx_pcre_ungetc(lx, c); return TOK_OCT; - - case S68: /* e.g. "\\00" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': state = S67; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; - } - break; - - case S69: /* e.g. "\\x{" */ + case S66: /* e.g. "\\x{" */ switch ((unsigned char) c) { case '0': case '1': @@ -1001,12 +1117,14 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S72; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case 'f': state = S69; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S70: /* e.g. "\\xa" */ + case S67: /* e.g. "\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -1029,17 +1147,17 @@ z3(struct lx_pcre_lx *lx) case 'c': case 'd': case 'e': - case 'f': state = S71; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + case 'f': state = S68; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S71: /* e.g. "\\xaa" */ - lx_pcre_ungetc(lx, c); return TOK_HEX; + case S68: /* e.g. "\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; - case S72: /* e.g. "\\x{a" */ + case S69: /* e.g. "\\x{0" */ switch ((unsigned char) c) { - case '}': state = S71; break; + case '}': state = S68; break; case '0': case '1': case '2': @@ -1062,17 +1180,84 @@ z3(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S70: /* e.g. 
"\\o{" */ + switch ((unsigned char) c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': state = S71; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; + case S71: /* e.g. "\\o{0" */ + switch ((unsigned char) c) { + case '}': state = S65; break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S72: /* e.g. "\\c\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CONTROL; + default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_INVALID; + case S2: return TOK_CHAR; + case S3: return TOK_CHAR; + case S4: return TOK_RANGE; + case S5: return lx->z = z7, TOK_CLOSEGROUP; + case S6: return lx->z = z7, TOK_CLOSEGROUPRANGE; + case S23: return TOK_NAMED__CLASS; + case S55: return TOK_NOESC; + case S56: return TOK_ESC; + case S57: return TOK_UNSUPPORTED; + case S58: return lx->z = z2, lx->z(lx); + case S59: return TOK_EOF; + case S60: return TOK_NOESC; + case S61: return TOK_NOESC; + case S62: return TOK_HEX; + case S63: return TOK_OCT; + case S64: return TOK_OCT; + case S65: return TOK_OCT; + case S67: return TOK_HEX; + case S68: return TOK_HEX; + case S72: return TOK_CONTROL; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { - case S55: - case S56: + case S58: + case S59: break; default: @@ -1084,64 +1269,41 @@ z3(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_INVALID; - case S2: return TOK_CHAR; - case S3: return TOK_CHAR; - case S4: return TOK_RANGE; - case S5: return TOK_CLOSEGROUP; - case S6: return 
TOK_CLOSEGROUPRANGE; - case S24: return TOK_NAMED__CLASS; - case S55: return TOK_EOF; - case S56: return TOK_EOF; - case S57: return TOK_UNSUPPORTED; - case S58: return TOK_HEX; - case S59: return TOK_OCT; - case S60: return TOK_NOESC; - case S61: return TOK_NOESC; - case S62: return TOK_NOESC; - case S63: return TOK_ESC; - case S64: return TOK_CONTROL; - case S67: return TOK_OCT; - case S68: return TOK_OCT; - case S70: return TOK_HEX; - case S71: return TOK_HEX; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z4(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '+': case 'R': state = S1; break; @@ -1184,12 +1346,14 @@ z4(struct lx_pcre_lx *lx) case '-': state = S8; break; case ')': state = S9; break; case ':': state = S10; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S1: /* e.g. "R" */ - lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + case S1: /* e.g. "+" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; case S2: /* e.g. 
"0" */ switch ((unsigned char) c) { @@ -1203,53 +1367,45 @@ z4(struct lx_pcre_lx *lx) case '7': case '8': case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S3: /* e.g. "n" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__IGNORE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__IGNORE; case S4: /* e.g. "x" */ switch ((unsigned char) c) { case 'x': state = S7; break; - default: lx_pcre_ungetc(lx, c); return TOK_FLAG__EXTENDED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__EXTENDED; } break; case S5: /* e.g. "s" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__SINGLE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__SINGLE; case S6: /* e.g. "i" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__INSENSITIVE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__INSENSITIVE; case S7: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_FLAG__UNKNOWN; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_FLAG__UNKNOWN; case S8: /* e.g. "-" */ - lx_pcre_ungetc(lx, c); return TOK_NEGATE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NEGATE; case S9: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_CLOSE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_CLOSE; case S10: /* e.g. 
":" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, TOK_SUB; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, TOK_SUB; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_UNSUPPORTED; case S2: return TOK_UNSUPPORTED; case S3: return TOK_FLAG__IGNORE; @@ -1258,38 +1414,52 @@ z4(struct lx_pcre_lx *lx) case S6: return TOK_FLAG__INSENSITIVE; case S7: return TOK_FLAG__UNKNOWN; case S8: return TOK_NEGATE; - case S9: return TOK_CLOSE; - case S10: return TOK_SUB; - default: errno = EINVAL; return TOK_ERROR; + case S9: return lx->z = z7, TOK_CLOSE; + case S10: return lx->z = z7, TOK_SUB; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z5(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '(': state = S2; break; case ')': state = S3; break; @@ -1297,23 +1467,34 @@ z5(struct lx_pcre_lx *lx) } break; - case S1: /* e.g. "a" */ + case S1: /* e.g. 
"\\x00" */ switch ((unsigned char) c) { case '(': - case ')': lx_pcre_ungetc(lx, c); return lx->z(lx); + case ')': lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z(lx); default: break; } break; case S2: /* e.g. "(" */ - lx_pcre_ungetc(lx, c); return TOK_INVALID__COMMENT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_INVALID__COMMENT; case S3: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return TOK_INVALID__COMMENT; + case S3: return lx->z = z7, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S1: @@ -1329,24 +1510,30 @@ z5(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_INVALID__COMMENT; - case S3: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z6(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; + assert(lx != NULL); + + if (lx->clear != NULL) { + lx->clear(lx->buf_opaque); + } + + lx->start = lx->end; + + void *getc_opaque = (void *)lx; enum { S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, @@ -1355,26 +1542,15 @@ z6(struct lx_pcre_lx *lx) S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63, S64, S65, S66, S67, S68, S69, - S70, S71, S72, S73, S74, S75, S76, S77, S78, NONE + S70, S71, S72, S73, S74, S75, S76, S77, S78 } state; - assert(lx != NULL); - - if (lx->clear != NULL) { - lx->clear(lx->buf_opaque); - } - - state = NONE; - - lx->start = lx->end; - - 
while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case ':': state = S1; break; case 'L': state = S2; break; @@ -1390,13 +1566,17 @@ z6(struct lx_pcre_lx *lx) case 'n': state = S12; break; case 'F': state = S13; break; case ')': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S1: /* e.g. ":" */ switch ((unsigned char) c) { - case ')': lx->lgetc = NULL; return TOK_UNKNOWN; + case ')': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S78; break; } break; @@ -1404,7 +1584,9 @@ z6(struct lx_pcre_lx *lx) case S2: /* e.g. "L" */ switch ((unsigned char) c) { case 'F': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1412,35 +1594,45 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'l': state = S18; break; case 'o': state = S76; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S4: /* e.g. "M" */ switch ((unsigned char) c) { case 'A': state = S74; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S5: /* e.g. 
"T" */ switch ((unsigned char) c) { case 'H': state = S72; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S6: /* e.g. "S" */ switch ((unsigned char) c) { case 'K': state = S70; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S7: /* e.g. "P" */ switch ((unsigned char) c) { case 'R': state = S67; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1448,21 +1640,27 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'C': state = S60; break; case 'N': state = S61; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S9: /* e.g. "a" */ switch ((unsigned char) c) { case 't': state = S56; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S10: /* e.g. 
"N" */ switch ((unsigned char) c) { case 'O': state = S46; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1470,7 +1668,9 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'R': state = S41; break; case 'O': state = S42; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1478,7 +1678,9 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'l': state = S18; break; case 'e': state = S19; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1486,31 +1688,35 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case ':': state = S1; break; case 'A': state = S15; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S14: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return lx->z = z7, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z7, lx->z(lx); case S15: /* e.g. "FA" */ switch ((unsigned char) c) { case 'I': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S16: /* e.g. "FAI" */ switch ((unsigned char) c) { case 'L': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S17: /* e.g. 
"FAIL" */ switch ((unsigned char) c) { case ':': state = S1; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; @@ -1518,84 +1724,108 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'a': case 'b': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S19: /* e.g. "ne" */ switch ((unsigned char) c) { case 'g': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S20: /* e.g. "neg" */ switch ((unsigned char) c) { case 'a': state = S21; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S21: /* e.g. "nega" */ switch ((unsigned char) c) { case 't': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S22: /* e.g. "negat" */ switch ((unsigned char) c) { case 'i': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S23: /* e.g. "negati" */ switch ((unsigned char) c) { case 'v': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S24: /* e.g. 
"negativ" */ switch ((unsigned char) c) { case 'e': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S25: /* e.g. "negative" */ switch ((unsigned char) c) { case '_': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S26: /* e.g. "negative_" */ switch ((unsigned char) c) { case 'l': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S27: /* e.g. "negative_l" */ switch ((unsigned char) c) { case 'o': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S28: /* e.g. "negative_lo" */ switch ((unsigned char) c) { case 'o': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S29: /* e.g. "negative_loo" */ switch ((unsigned char) c) { case 'k': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -1603,338 +1833,428 @@ z6(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case 'b': state = S31; break; case 'a': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S31: /* e.g. 
"negative_lookb" */ switch ((unsigned char) c) { case 'e': state = S38; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S32: /* e.g. "negative_looka" */ switch ((unsigned char) c) { case 'h': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S33: /* e.g. "negative_lookah" */ switch ((unsigned char) c) { case 'e': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S34: /* e.g. "negative_lookahe" */ switch ((unsigned char) c) { case 'a': state = S35; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S35: /* e.g. "negative_lookahea" */ switch ((unsigned char) c) { case 'd': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S36: /* e.g. "nla" */ switch ((unsigned char) c) { case ':': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S37: /* e.g. "LF" */ - lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; case S38: /* e.g. 
"negative_lookbe" */ switch ((unsigned char) c) { case 'h': state = S39; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S39: /* e.g. "negative_lookbeh" */ switch ((unsigned char) c) { case 'i': state = S40; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S40: /* e.g. "negative_lookbehi" */ switch ((unsigned char) c) { case 'n': state = S35; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S41: /* e.g. "CR" */ switch ((unsigned char) c) { case 'L': state = S2; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S42: /* e.g. "CO" */ switch ((unsigned char) c) { case 'M': state = S43; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S43: /* e.g. "COM" */ switch ((unsigned char) c) { case 'M': state = S44; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S44: /* e.g. "COMM" */ switch ((unsigned char) c) { case 'I': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S45: /* e.g. 
"ACCEP" */ switch ((unsigned char) c) { case 'T': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S46: /* e.g. "NO" */ switch ((unsigned char) c) { case '_': state = S47; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S47: /* e.g. "NO_" */ switch ((unsigned char) c) { case 'S': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S48: /* e.g. "NO_S" */ switch ((unsigned char) c) { case 'T': state = S49; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S49: /* e.g. "NO_ST" */ switch ((unsigned char) c) { case 'A': state = S50; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S50: /* e.g. "NO_STA" */ switch ((unsigned char) c) { case 'R': state = S51; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S51: /* e.g. "NO_STAR" */ switch ((unsigned char) c) { case 'T': state = S52; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S52: /* e.g. 
"NO_START" */ switch ((unsigned char) c) { case '_': state = S53; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S53: /* e.g. "NO_START_" */ switch ((unsigned char) c) { case 'O': state = S54; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S54: /* e.g. "NO_START_O" */ switch ((unsigned char) c) { case 'P': state = S55; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S55: /* e.g. "NO_START_OP" */ switch ((unsigned char) c) { case 'T': state = S37; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S56: /* e.g. "at" */ switch ((unsigned char) c) { case 'o': state = S57; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S57: /* e.g. "ato" */ switch ((unsigned char) c) { case 'm': state = S58; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S58: /* e.g. "atom" */ switch ((unsigned char) c) { case 'i': state = S59; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S59: /* e.g. 
"atomi" */ switch ((unsigned char) c) { case 'c': state = S36; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S60: /* e.g. "AC" */ switch ((unsigned char) c) { case 'C': state = S65; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S61: /* e.g. "AN" */ switch ((unsigned char) c) { case 'Y': state = S62; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S62: /* e.g. "ANY" */ switch ((unsigned char) c) { case 'C': state = S63; break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; case S63: /* e.g. "ANYC" */ switch ((unsigned char) c) { case 'R': state = S64; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S64: /* e.g. "ANYCR" */ switch ((unsigned char) c) { case 'L': state = S2; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S65: /* e.g. "ACC" */ switch ((unsigned char) c) { case 'E': state = S66; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S66: /* e.g. 
"ACCE" */ switch ((unsigned char) c) { case 'P': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S67: /* e.g. "PR" */ switch ((unsigned char) c) { case 'U': state = S68; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S68: /* e.g. "PRU" */ switch ((unsigned char) c) { case 'N': state = S69; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S69: /* e.g. "PRUN" */ switch ((unsigned char) c) { case 'E': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S70: /* e.g. "SK" */ switch ((unsigned char) c) { case 'I': state = S71; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S71: /* e.g. "SKI" */ switch ((unsigned char) c) { case 'P': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S72: /* e.g. "TH" */ switch ((unsigned char) c) { case 'E': state = S73; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S73: /* e.g. 
"THE" */ switch ((unsigned char) c) { case 'N': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S74: /* e.g. "MA" */ switch ((unsigned char) c) { case 'R': state = S75; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S75: /* e.g. "MAR" */ switch ((unsigned char) c) { case 'K': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S76: /* e.g. "po" */ switch ((unsigned char) c) { case 's': state = S77; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S77: /* e.g. "pos" */ switch ((unsigned char) c) { case 'i': state = S21; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S78: /* e.g. ":a" */ + case S78: /* e.g. 
":\\x00" */ switch ((unsigned char) c) { - case ')': lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + case ')': lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; default: break; } break; @@ -1942,6 +2262,21 @@ z6(struct lx_pcre_lx *lx) default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S13: return TOK_UNSUPPORTED; + case S14: return lx->z = z7, lx->z(lx); + case S17: return TOK_UNSUPPORTED; + case S37: return TOK_UNSUPPORTED; + case S41: return TOK_UNSUPPORTED; + case S62: return TOK_UNSUPPORTED; + case S78: return TOK_UNSUPPORTED; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S14: @@ -1956,53 +2291,44 @@ z6(struct lx_pcre_lx *lx) break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S13: return TOK_UNSUPPORTED; - case S14: return TOK_EOF; - case S17: return TOK_UNSUPPORTED; - case S37: return TOK_UNSUPPORTED; - case S41: return TOK_UNSUPPORTED; - case S62: return TOK_UNSUPPORTED; - case S78: return TOK_UNSUPPORTED; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_pcre_token z7(struct lx_pcre_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, - S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, - S40, S41, S42, S43, S44, S45, S46, S47, S48, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, + S30, S31, 
S32, S33, S34, S35, S36, S37, S38, S39, + S40, S41, S42, S43, S44, S45, S46, S47, S48 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\\': state = S2; break; case '\n': @@ -2023,13 +2349,15 @@ z7(struct lx_pcre_lx *lx) case '$': state = S14; break; case '^': state = S15; break; case ')': state = S16; break; - case '\x00': lx->lgetc = NULL; return TOK_UNKNOWN; + case '\x00': + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_pcre_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "\\x01" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "\\" */ switch ((unsigned char) c) { @@ -2042,24 +2370,7 @@ z7(struct lx_pcre_lx *lx) case 'X': case 'b': case 'g': - case 'k': state = S20; break; - case 'Q': state = S28; break; - case 'o': state = S29; break; - case 'c': state = S30; break; - case 'x': state = S32; break; - case '0': state = S33; break; - case 'R': state = S34; break; - case 'D': - case 'H': - case 'N': - case 'S': - case 'V': - case 'W': - case 'd': - case 'h': - case 's': - case 'v': - case 'w': state = S35; break; + case 'k': state = S21; break; case '$': case '(': case ')': @@ -2077,8 +2388,9 @@ z7(struct lx_pcre_lx *lx) case 'r': case 't': case '{': - case '|': state = S36; break; - case 'E': state = S37; break; + case '|': state = S28; break; + case 'E': state = S29; break; + case 'z': state = S30; break; case '1': case '2': case '3': @@ -2087,29 +2399,45 @@ z7(struct lx_pcre_lx *lx) case '6': case '7': case '8': - case '9': state = S38; break; - case 'z': state = S39; break; - default: state = S31; break; + case '9': state = S31; break; + case 'Q': state = S32; break; + case 'c': state = S33; break; + case 'o': state = S34; break; + case 
'x': state = S36; break; + case 'D': + case 'H': + case 'N': + case 'S': + case 'V': + case 'W': + case 'd': + case 'h': + case 's': + case 'v': + case 'w': state = S37; break; + case 'R': state = S38; break; + case '0': state = S39; break; + default: state = S35; break; } break; case S3: /* e.g. "\\x0a" */ - lx_pcre_ungetc(lx, c); return TOK_NEWLINE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NEWLINE; case S4: /* e.g. "\\x09" */ - lx_pcre_ungetc(lx, c); return TOK_WHITESPACE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_WHITESPACE; case S5: /* e.g. "#" */ - lx_pcre_ungetc(lx, c); return TOK_MAYBE_COMMENT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_MAYBE_COMMENT; case S6: /* e.g. "{" */ - lx_pcre_ungetc(lx, c); return lx->z = z1, TOK_OPENCOUNT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENCOUNT; case S7: /* e.g. "[" */ switch ((unsigned char) c) { case '^': state = S25; break; case ']': state = S26; break; - default: lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUP; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUP; } break; @@ -2117,66 +2445,56 @@ z7(struct lx_pcre_lx *lx) switch ((unsigned char) c) { case '?': state = S17; break; case '*': state = S18; break; - default: lx_pcre_ungetc(lx, c); return TOK_OPEN; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OPEN; } break; case S9: /* e.g. "|" */ - lx_pcre_ungetc(lx, c); return TOK_ALT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ALT; case S10: /* e.g. "." */ - lx_pcre_ungetc(lx, c); return TOK_ANY; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ANY; case S11: /* e.g. "+" */ - lx_pcre_ungetc(lx, c); return TOK_PLUS; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_PLUS; case S12: /* e.g. 
"*" */ - lx_pcre_ungetc(lx, c); return TOK_STAR; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_STAR; - case S13: /* e.g. "?" */ - lx_pcre_ungetc(lx, c); return TOK_OPT; + case S13: /* e.g. "\077" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OPT; case S14: /* e.g. "$" */ - lx_pcre_ungetc(lx, c); return TOK_END__NL; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_END__NL; case S15: /* e.g. "^" */ - lx_pcre_ungetc(lx, c); return TOK_START; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_START; case S16: /* e.g. ")" */ - lx_pcre_ungetc(lx, c); return TOK_CLOSE; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CLOSE; - case S17: /* e.g. "(?" */ + case S17: /* e.g. "(\077" */ switch ((unsigned char) c) { case '#': state = S19; break; + case '<': state = S20; break; case '!': case '&': - case '=': state = S20; break; - case 'P': state = S21; break; - case '<': state = S22; break; - default: lx_pcre_ungetc(lx, c); return lx->z = z4, TOK_FLAGS; + case '=': state = S21; break; + case 'P': state = S22; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z4, TOK_FLAGS; } break; case S18: /* e.g. "(*" */ - lx_pcre_ungetc(lx, c); return lx->z = z6, lx->z(lx); + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z6, lx->z(lx); - case S19: /* e.g. "(?#" */ - lx_pcre_ungetc(lx, c); return lx->z = z5, lx->z(lx); + case S19: /* e.g. "(\077#" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z5, lx->z(lx); - case S20: /* e.g. "\\b" */ - lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; - - case S21: /* e.g. "(?P" */ - switch ((unsigned char) c) { - case '>': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; - } - break; - - case S22: /* e.g. "(?<" */ + case S20: /* e.g. 
"(\077<" */ switch ((unsigned char) c) { case '!': - case '=': state = S20; break; + case '=': state = S21; break; case 'A': case 'B': case 'C': @@ -2230,11 +2548,25 @@ z7(struct lx_pcre_lx *lx) case 'x': case 'y': case 'z': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S21: /* e.g. "\\B" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; + + case S22: /* e.g. "(\077P" */ + switch ((unsigned char) c) { + case '>': state = S21; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S23: /* e.g. "(?': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S24: /* e.g. "(?" */ - lx_pcre_ungetc(lx, c); return TOK_OPENCAPTURE; + case S24: /* e.g. "(\077" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OPENCAPTURE; case S25: /* e.g. "[^" */ switch ((unsigned char) c) { case ']': state = S27; break; - default: lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUPINV; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUPINV; } break; case S26: /* e.g. "[]" */ - lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUPCB; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUPCB; case S27: /* e.g. "[^]" */ - lx_pcre_ungetc(lx, c); return lx->z = z3, TOK_OPENGROUPINVCB; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z3, TOK_OPENGROUPINVCB; - case S28: /* e.g. "\\Q" */ - lx_pcre_ungetc(lx, c); return lx->z = z0, lx->z(lx); + case S28: /* e.g. 
"\\$" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_ESC; - case S29: /* e.g. "\\o" */ + case S29: /* e.g. "\\E" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z(lx); + + case S30: /* e.g. "\\z" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_END; + + case S31: /* e.g. "\\1" */ switch ((unsigned char) c) { - case '{': state = S47; break; - default: lx_pcre_ungetc(lx, c); return TOK_NOESC; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_UNSUPPORTED; } break; - case S30: /* e.g. "\\c" */ - state = S46; break; + case S32: /* e.g. "\\Q" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return lx->z = z0, lx->z(lx); - case S31: /* e.g. "\\i" */ - lx_pcre_ungetc(lx, c); return TOK_NOESC; + case S33: /* e.g. "\\c" */ + state = S48; break; - case S32: /* e.g. "\\x" */ + case S34: /* e.g. "\\o" */ + switch ((unsigned char) c) { + case '{': state = S46; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + } + break; + + case S35: /* e.g. "\\\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NOESC; + + case S36: /* e.g. "\\x" */ switch ((unsigned char) c) { case '{': state = S42; break; case '0': @@ -2361,37 +2720,17 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': state = S43; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S33: /* e.g. "\\0" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': state = S40; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; - } - break; - - case S34: /* e.g. "\\R" */ - lx_pcre_ungetc(lx, c); return TOK_EOL; - - case S35: /* e.g. 
"\\d" */ - lx_pcre_ungetc(lx, c); return TOK_NAMED__CLASS; - - case S36: /* e.g. "\\a" */ - lx_pcre_ungetc(lx, c); return TOK_ESC; + case S37: /* e.g. "\\D" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; - case S37: /* e.g. "\\E" */ - lx_pcre_ungetc(lx, c); return lx->z(lx); + case S38: /* e.g. "\\R" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_EOL; - case S38: /* e.g. "\\1" */ + case S39: /* e.g. "\\0" */ switch ((unsigned char) c) { case '0': case '1': @@ -2400,16 +2739,11 @@ z7(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': - case '8': - case '9': break; - default: lx_pcre_ungetc(lx, c); return TOK_UNSUPPORTED; + case '7': state = S40; break; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S39: /* e.g. "\\z" */ - lx_pcre_ungetc(lx, c); return TOK_END; - case S40: /* e.g. "\\00" */ switch ((unsigned char) c) { case '0': @@ -2420,12 +2754,12 @@ z7(struct lx_pcre_lx *lx) case '5': case '6': case '7': state = S41; break; - default: lx_pcre_ungetc(lx, c); return TOK_OCT; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; } break; case S41: /* e.g. "\\000" */ - lx_pcre_ungetc(lx, c); return TOK_OCT; + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_OCT; case S42: /* e.g. "\\x{" */ switch ((unsigned char) c) { @@ -2451,11 +2785,13 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': state = S45; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S43: /* e.g. "\\xa" */ + case S43: /* e.g. 
"\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -2479,14 +2815,14 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': state = S44; break; - default: lx_pcre_ungetc(lx, c); return TOK_HEX; + default: lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; } break; - case S44: /* e.g. "\\xaa" */ - lx_pcre_ungetc(lx, c); return TOK_HEX; + case S44: /* e.g. "\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_HEX; - case S45: /* e.g. "\\x{a" */ + case S45: /* e.g. "\\x{0" */ switch ((unsigned char) c) { case '}': state = S44; break; case '0': @@ -2511,14 +2847,13 @@ z7(struct lx_pcre_lx *lx) case 'd': case 'e': case 'f': break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S46: /* e.g. "\\ca" */ - lx_pcre_ungetc(lx, c); return TOK_CONTROL; - - case S47: /* e.g. "\\o{" */ + case S46: /* e.g. "\\o{" */ switch ((unsigned char) c) { case '0': case '1': @@ -2527,12 +2862,14 @@ z7(struct lx_pcre_lx *lx) case '4': case '5': case '6': - case '7': state = S48; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + case '7': state = S47; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S48: /* e.g. "\\o{0" */ + case S47: /* e.g. "\\o{0" */ switch ((unsigned char) c) { case '}': state = S41; break; case '0': @@ -2543,43 +2880,29 @@ z7(struct lx_pcre_lx *lx) case '5': case '6': case '7': break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - default: - ; /* unreached */ - } - - switch (state) { - case S18: - case S19: - case S28: - case S37: - break; + case S48: /* e.g. 
"\\c\\x00" */ + lx_pcre_ungetc(lx, c); lx_pcre_dynpop(lx->buf_opaque); return TOK_CONTROL; default: - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } - break; - + ; /* unreached */ } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_INVALID; case S3: return TOK_NEWLINE; case S4: return TOK_WHITESPACE; case S5: return TOK_MAYBE_COMMENT; - case S6: return TOK_OPENCOUNT; - case S7: return TOK_OPENGROUP; + case S6: return lx->z = z1, TOK_OPENCOUNT; + case S7: return lx->z = z3, TOK_OPENGROUP; case S8: return TOK_OPEN; case S9: return TOK_ALT; case S10: return TOK_ANY; @@ -2589,33 +2912,59 @@ z7(struct lx_pcre_lx *lx) case S14: return TOK_END__NL; case S15: return TOK_START; case S16: return TOK_CLOSE; - case S17: return TOK_FLAGS; - case S18: return TOK_EOF; - case S19: return TOK_EOF; - case S20: return TOK_UNSUPPORTED; + case S17: return lx->z = z4, TOK_FLAGS; + case S18: return lx->z = z6, lx->z(lx); + case S19: return lx->z = z5, lx->z(lx); + case S21: return TOK_UNSUPPORTED; case S24: return TOK_OPENCAPTURE; - case S25: return TOK_OPENGROUPINV; - case S26: return TOK_OPENGROUPCB; - case S27: return TOK_OPENGROUPINVCB; - case S28: return TOK_EOF; - case S29: return TOK_NOESC; - case S30: return TOK_NOESC; - case S31: return TOK_NOESC; - case S32: return TOK_HEX; - case S33: return TOK_OCT; - case S34: return TOK_EOL; - case S35: return TOK_NAMED__CLASS; - case S36: return TOK_ESC; - case S37: return TOK_EOF; - case S38: return TOK_UNSUPPORTED; - case S39: return TOK_END; + case S25: return lx->z = z3, TOK_OPENGROUPINV; + case S26: return lx->z = z3, TOK_OPENGROUPCB; + case S27: return lx->z = z3, TOK_OPENGROUPINVCB; + case S28: return TOK_ESC; + case S29: return TOK_EOF; + case S30: return TOK_END; + case S31: return TOK_UNSUPPORTED; + case S32: return lx->z = z0, lx->z(lx); + case S33: return TOK_NOESC; + case S34: return 
TOK_NOESC; + case S35: return TOK_NOESC; + case S36: return TOK_HEX; + case S37: return TOK_NAMED__CLASS; + case S38: return TOK_EOL; + case S39: return TOK_OCT; case S40: return TOK_OCT; case S41: return TOK_OCT; case S43: return TOK_HEX; case S44: return TOK_HEX; - case S46: return TOK_CONTROL; - default: errno = EINVAL; return TOK_ERROR; + case S48: return TOK_CONTROL; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_pcre_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + switch (state) { + case S18: + case S19: + case S29: + case S32: + break; + + default: + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + break; + + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -3092,6 +3441,7 @@ lx_pcre_init(struct lx_pcre_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_pcre_dynpop; } enum lx_pcre_token diff --git a/src/libre/dialect/pcre/parser.c b/src/libre/dialect/pcre/parser.c index 78c11d795..e59fee020 100644 --- a/src/libre/dialect/pcre/parser.c +++ b/src/libre/dialect/pcre/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -325,7 +325,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta /* BEGINNING OF EXTRACT: FLAG_EXTENDED */ { -#line 666 "src/libre/parser.act" +#line 665 "src/libre/parser.act" ZIc = RE_EXTENDED; @@ -335,7 +335,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-union */ { -#line 801 "src/libre/parser.act" +#line 800 "src/libre/parser.act" (ZIo) = (ZIi) | (ZIc); @@ -356,7 +356,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta /* BEGINNING OF EXTRACT: FLAG_INSENSITIVE */ { -#line 662 "src/libre/parser.act" +#line 661 "src/libre/parser.act" ZIc = RE_ICASE; @@ 
-366,7 +366,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-union */ { -#line 801 "src/libre/parser.act" +#line 800 "src/libre/parser.act" (ZIo) = (ZIi) | (ZIc); @@ -381,7 +381,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta /* BEGINNING OF EXTRACT: FLAG_SINGLE */ { -#line 670 "src/libre/parser.act" +#line 669 "src/libre/parser.act" ZIc = RE_SINGLE; @@ -391,7 +391,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-union */ { -#line 801 "src/libre/parser.act" +#line 800 "src/libre/parser.act" (ZIo) = (ZIi) | (ZIc); @@ -406,7 +406,7 @@ p_expr_C_Cflags_C_Cflag__set(flags flags, lex_state lex_state, act_state act_sta ZIo = ZIi; /* BEGINNING OF ACTION: err-unknown-flag */ { -#line 743 "src/libre/parser.act" +#line 739 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EFLAG; @@ -451,7 +451,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -474,7 +474,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: CONTROL */ { -#line 448 "src/libre/parser.act" +#line 442 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] == 'c'); @@ -522,7 +522,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -538,7 +538,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: ESC */ { -#line 391 
"src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -574,7 +574,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -632,7 +632,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -685,7 +685,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags { /* BEGINNING OF EXTRACT: UNSUPPORTED */ { -#line 429 "src/libre/parser.act" +#line 426 "src/libre/parser.act" /* handle \1-\9 back references */ if (lex_state->buf.a[0] == '\\' && lex_state->buf.a[1] != '\0' && lex_state->buf.a[2] == '\0') { @@ -707,7 +707,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -726,7 +726,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hliteral(flags /* END OF INLINE: 155 */ /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -762,7 +762,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIclass), (ZInode))) { goto ZL1; @@ -775,7 +775,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; goto ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms; /* END OF INLINE: expr::character-class::list-of-class-terms 
*/ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -822,7 +822,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -851,7 +851,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; goto ZL2_expr_C_Clist_Hof_Hpieces; /* END OF INLINE: expr::list-of-pieces */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -874,7 +874,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla { /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (*ZI290)); if ((ZInode) == NULL) { @@ -894,7 +894,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_NAMED; (ZIlower).u.named.class = (*ZI290); @@ -910,7 +910,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla } /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI291)); mark(&act_state->rangeend, &(ZIend)); @@ -920,7 +920,7 @@ p_293(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__cla /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -980,7 +980,7 @@ p_168(flags flags, lex_state lex_state, act_state act_state, err err) case (TOK_RANGE): /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI169 = '-'; ZI170 = lex_state->lx.start; @@ -1004,7 +1004,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-range */ { -#line 722 
"src/libre/parser.act" +#line 718 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXRANGE; @@ -1043,7 +1043,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1069,7 +1069,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: CONTROL */ { -#line 448 "src/libre/parser.act" +#line 442 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] == 'c'); @@ -1124,7 +1124,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1163,7 +1163,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1224,7 +1224,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: NOESC */ { -#line 417 "src/libre/parser.act" +#line 412 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1251,7 +1251,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1307,7 +1307,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* BEGINNING OF EXTRACT: UNSUPPORTED */ { -#line 429 "src/libre/parser.act" +#line 426 "src/libre/parser.act" /* handle \1-\9 back references */ if (lex_state->buf.a[0] == '\\' && lex_state->buf.a[1] 
!= '\0' && lex_state->buf.a[2] == '\0') { @@ -1329,7 +1329,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -1348,7 +1348,7 @@ p_expr_C_Cliteral(flags flags, lex_state lex_state, act_state act_state, err err /* END OF INLINE: 111 */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1381,7 +1381,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1413,7 +1413,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CONTROL */ { -#line 448 "src/libre/parser.act" +#line 442 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] == 'c'); @@ -1461,7 +1461,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -1486,7 +1486,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: ESC */ { -#line 391 "src/libre/parser.act" +#line 386 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1531,7 +1531,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: HEX */ { -#line 535 "src/libre/parser.act" +#line 527 "src/libre/parser.act" unsigned 
long u; char *s, *e; @@ -1598,7 +1598,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZI290 = DIALECT_CLASS(lex_state->buf.a); if (ZI290 == NULL) { @@ -1631,7 +1631,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: NOESC */ { -#line 417 "src/libre/parser.act" +#line 412 "src/libre/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1651,7 +1651,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1671,7 +1671,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: OCT */ { -#line 492 "src/libre/parser.act" +#line 484 "src/libre/parser.act" unsigned long u; char *s, *e; @@ -1733,7 +1733,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: UNSUPPORTED */ { -#line 429 "src/libre/parser.act" +#line 426 "src/libre/parser.act" /* handle \1-\9 back references */ if (lex_state->buf.a[0] == '\\' && lex_state->buf.a[1] != '\0' && lex_state->buf.a[2] == '\0') { @@ -1755,7 +1755,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -1801,7 +1801,7 @@ p_expr_C_Ccomment(flags flags, lex_state lex_state, act_state act_state, err err ADVANCE_LEXER; /* BEGINNING OF ACTION: err-invalid-comment */ { -#line 687 "src/libre/parser.act" +#line 683 
"src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EBADCOMMENT; @@ -1835,7 +1835,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -1859,7 +1859,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_C_Crange_Hendpoint_Hclass(flags fl ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-class */ { -#line 845 "src/libre/parser.act" +#line 844 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_NAMED; (ZIr).u.named.class = (ZIid); @@ -1899,7 +1899,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUP */ { -#line 319 "src/libre/parser.act" +#line 318 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI181 = lex_state->lx.end; @@ -1913,7 +1913,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1940,7 +1940,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPCB */ { -#line 335 "src/libre/parser.act" +#line 334 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI200 = lex_state->lx.end; @@ -1954,7 +1954,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1967,7 +1967,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: make-literal-cbrak */ 
{ -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -1981,7 +1981,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -2003,7 +2003,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINV */ { -#line 327 "src/libre/parser.act" +#line 326 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI192 = lex_state->lx.end; @@ -2017,7 +2017,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2030,7 +2030,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -2087,7 +2087,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: OPENGROUPINVCB */ { -#line 343 "src/libre/parser.act" +#line 342 "src/libre/parser.act" ZIstart = lex_state->lx.start; ZI207 = lex_state->lx.end; @@ -2101,7 +2101,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2114,7 +2114,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ZItmp = ZInode; /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 
"src/libre/parser.act" struct ast_expr *any; @@ -2157,7 +2157,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-make-invert */ /* BEGINNING OF ACTION: make-literal-cbrak */ { -#line 886 "src/libre/parser.act" +#line 885 "src/libre/parser.act" (ZIcbrak) = ']'; @@ -2171,7 +2171,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZItmp), (ZInode1))) { goto ZL1; @@ -2203,7 +2203,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI214 = ']'; ZI215 = lex_state->lx.start; @@ -2219,7 +2219,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -2238,7 +2238,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUPRANGE */ { -#line 361 "src/libre/parser.act" +#line 360 "src/libre/parser.act" ZIcrange = '-'; ZI217 = lex_state->lx.start; @@ -2254,7 +2254,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIrange) = ast_make_expr_literal(act_state->poolp, *flags, (ZIcrange)); if ((ZIrange) == NULL) { @@ -2266,7 +2266,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if 
(!ast_add_expr_alt((ZItmp), (ZIrange))) { goto ZL4; @@ -2277,7 +2277,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIstart)); mark(&act_state->groupend, &(ZIend)); @@ -2295,7 +2295,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -2338,7 +2338,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI163 = lex_state->lx.start; @@ -2354,7 +2354,7 @@ p_expr_C_Ccharacter_Hclass_C_Crange_Hendpoint_Hend(flags flags, lex_state lex_st ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (ZIc); @@ -2401,7 +2401,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (*ZI314)); if ((ZInode) == NULL) { @@ -2421,7 +2421,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZI314); @@ -2437,7 +2437,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI } /* BEGINNING OF ACTION: mark-range */ { 
-#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI315)); mark(&act_state->rangeend, &(ZIend)); @@ -2447,7 +2447,7 @@ p_317(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2522,7 +2522,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -2531,7 +2531,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: count-one */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -2578,7 +2578,7 @@ p_320(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI256 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -2592,7 +2592,7 @@ p_320(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI318)); mark(&act_state->countend, &(ZIend)); @@ -2602,7 +2602,7 @@ p_320(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((*ZIm) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -2656,7 +2656,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: 
ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2677,7 +2677,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -2689,7 +2689,7 @@ ZL1:; /* END OF ACTION: err-expected-alts */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2722,7 +2722,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI261 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -2736,7 +2736,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI318)); mark(&act_state->countend, &(ZIend)); @@ -2746,7 +2746,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-unbounded */ { -#line 805 "src/libre/parser.act" +#line 804 "src/libre/parser.act" (ZIn) = AST_COUNT_UNBOUNDED; @@ -2755,7 +2755,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: count-unbounded */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -2783,7 +2783,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; 
@@ -2811,7 +2811,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 case (TOK_CLOSECOUNT): /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI259 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -2829,7 +2829,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI318)); mark(&act_state->countend, &(ZIend)); @@ -2839,7 +2839,7 @@ p_321(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI3 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -2885,7 +2885,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZIrstart = lex_state->lx.start; @@ -2906,7 +2906,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode1) == NULL) { @@ -2929,7 +2929,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (ZIc); @@ -2939,7 +2939,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 
"src/libre/parser.act" ZI197 = '-'; ZI198 = lex_state->lx.start; @@ -2960,7 +2960,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp } /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -2999,7 +2999,7 @@ p_194(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF INLINE: 196 */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZItmp), (ZInode1))) { goto ZL1; @@ -3044,7 +3044,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, ADVANCE_LEXER; /* BEGINNING OF ACTION: re-flag-none */ { -#line 797 "src/libre/parser.act" +#line 796 "src/libre/parser.act" (ZIempty__pos) = RE_FLAGS_NONE; @@ -3053,7 +3053,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: re-flag-none */ /* BEGINNING OF ACTION: re-flag-none */ { -#line 797 "src/libre/parser.act" +#line 796 "src/libre/parser.act" (ZIempty__neg) = RE_FLAGS_NONE; @@ -3110,7 +3110,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-mask-re-flags */ { -#line 931 "src/libre/parser.act" +#line 926 "src/libre/parser.act" /* * Note: in cases like `(?i-i)`, the negative is @@ -3124,7 +3124,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: ast-mask-re-flags */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -3144,7 +3144,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-get-re-flags */ { -#line 919 "src/libre/parser.act" +#line 918 "src/libre/parser.act" 
(ZIflags) = *flags; @@ -3153,7 +3153,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: ast-get-re-flags */ /* BEGINNING OF ACTION: ast-mask-re-flags */ { -#line 931 "src/libre/parser.act" +#line 926 "src/libre/parser.act" /* * Note: in cases like `(?i-i)`, the negative is @@ -3172,7 +3172,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, } /* BEGINNING OF ACTION: ast-set-re-flags */ { -#line 923 "src/libre/parser.act" +#line 922 "src/libre/parser.act" *flags = (ZIflags); @@ -3197,7 +3197,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, { /* BEGINNING OF ACTION: err-expected-closeflags */ { -#line 750 "src/libre/parser.act" +#line 746 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEFLAGS; @@ -3209,7 +3209,7 @@ p_expr_C_Cflags(flags flags, lex_state lex_state, act_state act_state, err err, /* END OF ACTION: err-expected-closeflags */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -3250,7 +3250,7 @@ p_expr_C_Cpiece_C_Clist_Hof_Hcounts(flags flags, lex_state lex_state, act_state } /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -3275,7 +3275,7 @@ p_expr_C_Cpiece_C_Clist_Hof_Hcounts(flags flags, lex_state lex_state, act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -3292,7 +3292,7 @@ p_expr_C_Cpiece_C_Clist_Hof_Hcounts(flags flags, lex_state lex_state, act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: err-unsupported */ { -#line 764 "src/libre/parser.act" +#line 760 
"src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EUNSUPPORTED; @@ -3384,7 +3384,7 @@ p_class_Hnamed(flags flags, lex_state lex_state, act_state act_state, err err, t case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -3408,7 +3408,7 @@ p_class_Hnamed(flags flags, lex_state lex_state, act_state act_state, err err, t ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (ZIid)); if ((ZInode) == NULL) { @@ -3439,7 +3439,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode1) = ast_make_expr_literal(act_state->poolp, *flags, (*ZIcbrak)); if ((ZInode1) == NULL) { @@ -3463,7 +3463,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIr).type = AST_ENDPOINT_LITERAL; (ZIr).u.literal.c = (unsigned char) (*ZIcbrak); @@ -3473,7 +3473,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI210 = '-'; ZI211 = lex_state->lx.start; @@ -3494,7 +3494,7 @@ p_209(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZIs } /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIlower).type = AST_ENDPOINT_LITERAL; (ZIlower).u.literal.c = (unsigned char) (*ZIcbrak); @@ -3504,7 +3504,7 @@ p_209(flags flags, lex_state 
lex_state, act_state act_state, err err, t_pos *ZIs /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -3566,7 +3566,7 @@ ZL2_expr_C_Clist_Hof_Halts:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIalts), (ZIa))) { goto ZL1; @@ -3585,7 +3585,7 @@ ZL2_expr_C_Clist_Hof_Halts:; goto ZL2_expr_C_Clist_Hof_Halts; /* END OF INLINE: expr::list-of-alts */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -3597,7 +3597,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -3629,7 +3629,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: OPENCOUNT */ { -#line 371 "src/libre/parser.act" +#line 370 "src/libre/parser.act" ZI318 = lex_state->lx.start; ZI319 = lex_state->lx.end; @@ -3645,7 +3645,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -3685,7 +3685,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-one */ { -#line 817 "src/libre/parser.act" +#line 816 "src/libre/parser.act" (ZIc) = ast_make_count(0, 1); @@ -3699,7 +3699,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-one-or-more */ { -#line 813 "src/libre/parser.act" +#line 812 "src/libre/parser.act" (ZIc) = ast_make_count(1, AST_COUNT_UNBOUNDED); @@ -3713,7 +3713,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, 
ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -3732,7 +3732,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-count */ { -#line 701 "src/libre/parser.act" +#line 697 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCOUNT; @@ -3744,7 +3744,7 @@ ZL1:; /* END OF ACTION: err-expected-count */ /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -3774,7 +3774,7 @@ p_re__pcre(flags flags, lex_state lex_state, act_state act_state, err err, t_ast /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -3788,7 +3788,7 @@ p_re__pcre(flags flags, lex_state lex_state, act_state act_state, err err, t_ast } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -3814,7 +3814,7 @@ p_re__pcre(flags flags, lex_state lex_state, act_state act_state, err err, t_ast { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -3850,7 +3850,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -3860,7 +3860,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -3877,7 +3877,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-end */ { -#line 943 "src/libre/parser.act" +#line 942 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_END); if ((ZIe) == NULL) { @@ -3894,7 +3894,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-anchor-end-nl */ { -#line 950 "src/libre/parser.act" +#line 949 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_END); if ((ZIe) == NULL) { @@ -3922,7 +3922,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-bsr */ { -#line 789 "src/libre/parser.act" +#line 787 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIclass__bsr) = &class_bsr; @@ -3932,7 +3932,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-bsr */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIbsr) = ast_make_expr_named(act_state->poolp, *flags, (ZIclass__bsr)); if ((ZIbsr) == NULL) { @@ -3944,7 +3944,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZIcrlf) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZIcrlf) == NULL) { @@ -3956,7 
+3956,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-concat */ /* BEGINNING OF ACTION: make-literal-cr */ { -#line 890 "src/libre/parser.act" +#line 889 "src/libre/parser.act" (ZIcr) = '\r'; @@ -3965,7 +3965,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: make-literal-cr */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIecr) = ast_make_expr_literal(act_state->poolp, *flags, (ZIcr)); if ((ZIecr) == NULL) { @@ -3977,7 +3977,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: make-literal-nl */ { -#line 894 "src/libre/parser.act" +#line 893 "src/libre/parser.act" (ZInl) = '\n'; @@ -3986,7 +3986,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: make-literal-nl */ /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIenl) = ast_make_expr_literal(act_state->poolp, *flags, (ZInl)); if ((ZIenl) == NULL) { @@ -3998,7 +3998,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcrlf), (ZIecr))) { goto ZL1; @@ -4009,7 +4009,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-add-concat */ /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcrlf), (ZIenl))) { goto ZL1; @@ -4020,7 +4020,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-add-concat */ /* BEGINNING OF ACTION: 
ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZIe) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -4032,7 +4032,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-alt */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIe), (ZIcrlf))) { goto ZL1; @@ -4043,7 +4043,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIe), (ZIbsr))) { goto ZL1; @@ -4063,7 +4063,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-get-re-flags */ { -#line 919 "src/libre/parser.act" +#line 918 "src/libre/parser.act" (ZIflags) = *flags; @@ -4072,7 +4072,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-get-re-flags */ /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -4086,7 +4086,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e } /* BEGINNING OF ACTION: ast-set-re-flags */ { -#line 923 "src/libre/parser.act" +#line 922 "src/libre/parser.act" *flags = (ZIflags); @@ -4095,7 +4095,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-set-re-flags */ /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZIe) = ast_make_expr_group(act_state->poolp, *flags, (ZIg), (ZIid)); if ((ZIe) == NULL) { @@ -4119,7 +4119,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* 
BEGINNING OF ACTION: ast-make-anchor-start */ { -#line 936 "src/libre/parser.act" +#line 935 "src/libre/parser.act" (ZIe) = ast_make_expr_anchor(act_state->poolp, *flags, AST_ANCHOR_START); if ((ZIe) == NULL) { @@ -4178,7 +4178,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -4190,7 +4190,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZIe) == NULL) { @@ -4223,7 +4223,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZInode) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -4244,7 +4244,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -4287,7 +4287,7 @@ p_expr_C_Ctype(flags flags, lex_state lex_state, act_state act_state, err err, t } /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -4299,7 +4299,7 @@ p_expr_C_Ctype(flags flags, lex_state lex_state, act_state act_state, err err, t /* END OF ACTION: ast-make-alt */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZInode), (ZIclass))) { goto ZL1; @@ -4319,7 +4319,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff 
--git a/src/libre/dialect/pcre/parser.h b/src/libre/dialect/pcre/parser.h index c0cfbabe3..84ef34223 100644 --- a/src/libre/dialect/pcre/parser.h +++ b/src/libre/dialect/pcre/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__pcre(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/pcre/parser.h" diff --git a/src/libre/dialect/sql/lexer.c b/src/libre/dialect/sql/lexer.c index 6c35bf800..87459e786 100644 --- a/src/libre/dialect/sql/lexer.c +++ b/src/libre/dialect/sql/lexer.c @@ -12,11 +12,31 @@ static enum lx_sql_token z0(struct lx_sql_lx *lx); static enum lx_sql_token z1(struct lx_sql_lx *lx); static enum lx_sql_token z2(struct lx_sql_lx *lx); +static int +lx_sql_advance_end(struct lx_sql_lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif static int -lx_getc(struct lx_sql_lx *lx) +lx_sql_getc(struct lx_sql_lx *lx) { int c; @@ -32,18 +52,19 @@ lx_getc(struct lx_sql_lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_sql_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_sql_getc to the interface + * in libfsm's generated code. 
*/ +static int +fsm_getc(void *getc_opaque) +{ + return lx_sql_getc((struct lx_sql_lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -52,10 +73,7 @@ lx_sql_ungetc(struct lx_sql_lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -107,6 +125,17 @@ lx_sql_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_sql_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_sql_dynclear(void *buf_opaque) { @@ -146,29 +175,28 @@ lx_sql_dynfree(void *buf_opaque) static enum lx_sql_token z0(struct lx_sql_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case ',': state = S1; break; case '0': @@ -182,12 +210,14 @@ z0(struct lx_sql_lx *lx) case '8': case '9': state = S2; break; case '}': state = S3; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S1: /* e.g. "," */ - lx_sql_ungetc(lx, c); return TOK_SEP; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_SEP; case S2: /* e.g. "0" */ switch ((unsigned char) c) { @@ -201,64 +231,70 @@ z0(struct lx_sql_lx *lx) case '7': case '8': case '9': break; - default: lx_sql_ungetc(lx, c); return TOK_COUNT; + default: lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_COUNT; } break; case S3: /* e.g. 
"}" */ - lx_sql_ungetc(lx, c); return lx->z = z2, TOK_CLOSECOUNT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSECOUNT; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_SEP; + case S2: return TOK_COUNT; + case S3: return lx->z = z2, TOK_CLOSECOUNT; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_SEP; - case S2: return TOK_COUNT; - case S3: return TOK_CLOSECOUNT; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_sql_token z1(struct lx_sql_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, - S30, S31, S32, S33, S34, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, + S30, S31, S32, S33, S34 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '[': state = S1; break; case '-': state = S3; break; @@ -271,21 +307,21 @@ z1(struct lx_sql_lx *lx) case S1: /* e.g. 
"[" */ switch ((unsigned char) c) { case ':': state = S6; break; - default: lx_sql_ungetc(lx, c); return TOK_CHAR; + default: lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CHAR; } break; case S2: /* e.g. "\\x00" */ - lx_sql_ungetc(lx, c); return TOK_CHAR; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CHAR; case S3: /* e.g. "-" */ - lx_sql_ungetc(lx, c); return TOK_RANGE; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_RANGE; case S4: /* e.g. "^" */ - lx_sql_ungetc(lx, c); return TOK_INVERT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_INVERT; case S5: /* e.g. "]" */ - lx_sql_ungetc(lx, c); return lx->z = z2, TOK_CLOSEGROUP; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z2, TOK_CLOSEGROUP; case S6: /* e.g. "[:" */ switch ((unsigned char) c) { @@ -295,94 +331,120 @@ z1(struct lx_sql_lx *lx) case 'A': state = S10; break; case 'L': state = S11; break; case 'U': state = S12; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S7: /* e.g. "[:S" */ switch ((unsigned char) c) { case 'P': state = S32; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S8: /* e.g. "[:W" */ switch ((unsigned char) c) { case 'H': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S9: /* e.g. "[:D" */ switch ((unsigned char) c) { case 'I': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S10: /* e.g. 
"[:A" */ switch ((unsigned char) c) { case 'L': state = S20; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S11: /* e.g. "[:L" */ switch ((unsigned char) c) { case 'O': state = S19; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S12: /* e.g. "[:U" */ switch ((unsigned char) c) { case 'P': state = S13; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S13: /* e.g. "[:UP" */ switch ((unsigned char) c) { case 'P': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S14: /* e.g. "[:LOW" */ switch ((unsigned char) c) { case 'E': state = S15; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S15: /* e.g. "[:LOWE" */ switch ((unsigned char) c) { case 'R': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S16: /* e.g. "[:ALPHA" */ switch ((unsigned char) c) { case ':': state = S17; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S17: /* e.g. 
"[:ALPHA:" */ switch ((unsigned char) c) { case ']': state = S18; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S18: /* e.g. "[:ALPHA:]" */ - lx_sql_ungetc(lx, c); return TOK_NAMED__CLASS; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_NAMED__CLASS; case S19: /* e.g. "[:LO" */ switch ((unsigned char) c) { case 'W': state = S14; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; @@ -390,160 +452,196 @@ z1(struct lx_sql_lx *lx) switch ((unsigned char) c) { case 'N': state = S21; break; case 'P': state = S22; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S21: /* e.g. "[:ALN" */ switch ((unsigned char) c) { case 'U': state = S24; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S22: /* e.g. "[:ALP" */ switch ((unsigned char) c) { case 'H': state = S23; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S23: /* e.g. "[:ALPH" */ switch ((unsigned char) c) { case 'A': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S24: /* e.g. 
"[:ALNU" */ switch ((unsigned char) c) { case 'M': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S25: /* e.g. "[:DI" */ switch ((unsigned char) c) { case 'G': state = S26; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S26: /* e.g. "[:DIG" */ switch ((unsigned char) c) { case 'I': state = S27; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S27: /* e.g. "[:DIGI" */ switch ((unsigned char) c) { case 'T': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S28: /* e.g. "[:WH" */ switch ((unsigned char) c) { case 'I': state = S29; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S29: /* e.g. "[:WHI" */ switch ((unsigned char) c) { case 'T': state = S30; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S30: /* e.g. "[:WHIT" */ switch ((unsigned char) c) { case 'E': state = S31; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S31: /* e.g. 
"[:WHITE" */ switch ((unsigned char) c) { case 'S': state = S7; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S32: /* e.g. "[:SP" */ switch ((unsigned char) c) { case 'A': state = S33; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S33: /* e.g. "[:SPA" */ switch ((unsigned char) c) { case 'C': state = S34; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; case S34: /* e.g. "[:SPAC" */ switch ((unsigned char) c) { case 'E': state = S16; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; case S2: return TOK_CHAR; case S3: return TOK_RANGE; case S4: return TOK_INVERT; - case S5: return TOK_CLOSEGROUP; + case S5: return lx->z = z2, TOK_CLOSEGROUP; case S18: return TOK_NAMED__CLASS; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_sql_token z2(struct lx_sql_lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, 
S9, - S10, S11, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '{': state = S2; break; case '[': state = S3; break; @@ -560,56 +658,48 @@ z2(struct lx_sql_lx *lx) break; case S1: /* e.g. "\\x00" */ - lx_sql_ungetc(lx, c); return TOK_CHAR; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. "{" */ - lx_sql_ungetc(lx, c); return lx->z = z0, TOK_OPENCOUNT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z0, TOK_OPENCOUNT; case S3: /* e.g. "[" */ - lx_sql_ungetc(lx, c); return lx->z = z1, TOK_OPENGROUP; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return lx->z = z1, TOK_OPENGROUP; case S4: /* e.g. "|" */ - lx_sql_ungetc(lx, c); return TOK_ALT; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_ALT; case S5: /* e.g. "+" */ - lx_sql_ungetc(lx, c); return TOK_PLUS; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_PLUS; case S6: /* e.g. "*" */ - lx_sql_ungetc(lx, c); return TOK_STAR; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_STAR; - case S7: /* e.g. "?" */ - lx_sql_ungetc(lx, c); return TOK_OPT; + case S7: /* e.g. "\077" */ + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_OPT; case S8: /* e.g. ")" */ - lx_sql_ungetc(lx, c); return TOK_CLOSESUB; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_CLOSESUB; case S9: /* e.g. "(" */ - lx_sql_ungetc(lx, c); return TOK_OPENSUB; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_OPENSUB; case S10: /* e.g. 
"%" */ - lx_sql_ungetc(lx, c); return TOK_MANY; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_MANY; case S11: /* e.g. "_" */ - lx_sql_ungetc(lx, c); return TOK_ANY; + lx_sql_ungetc(lx, c); lx_sql_dynpop(lx->buf_opaque); return TOK_ANY; default: ; /* unreached */ } - - if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, (char)c)) { - return TOK_ERROR; - } - } } - lx->lgetc = NULL; - + /* end states */ switch (state) { - case NONE: return TOK_EOF; case S1: return TOK_CHAR; - case S2: return TOK_OPENCOUNT; - case S3: return TOK_OPENGROUP; + case S2: return lx->z = z0, TOK_OPENCOUNT; + case S3: return lx->z = z1, TOK_OPENGROUP; case S4: return TOK_ALT; case S5: return TOK_PLUS; case S6: return TOK_STAR; @@ -618,8 +708,23 @@ z2(struct lx_sql_lx *lx) case S9: return TOK_OPENSUB; case S10: return TOK_MANY; case S11: return TOK_ANY; - default: errno = EINVAL; return TOK_ERROR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_sql_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } + + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return TOK_ERROR; + } + } + + lx->lgetc = NULL; + + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -747,6 +852,7 @@ lx_sql_init(struct lx_sql_lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_sql_dynpop; } enum lx_sql_token diff --git a/src/libre/dialect/sql/parser.c b/src/libre/dialect/sql/parser.c index d380c5b7d..e7a4c2e75 100644 --- a/src/libre/dialect/sql/parser.c +++ b/src/libre/dialect/sql/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 275 "src/libre/parser.act" +#line 22 "src/libre/parser.act" #include @@ -311,7 +311,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: INVERT */ { -#line 303 "src/libre/parser.act" +#line 302 "src/libre/parser.act" ZI203 = '^'; @@ -337,7 +337,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, 
lex_state lex_state, act_ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI114 = lex_state->lx.start; @@ -353,7 +353,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, lex_state lex_state, act_ ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -365,7 +365,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hhead(flags flags, lex_state lex_state, act_ /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZIclass), (ZInode))) { goto ZL1; @@ -401,7 +401,7 @@ p_re__sql(flags flags, lex_state lex_state, act_state act_state, err err, t_ast_ /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -415,7 +415,7 @@ p_re__sql(flags flags, lex_state lex_state, act_state act_state, err err, t_ast_ } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZInode) = ast_make_expr_group(act_state->poolp, *flags, (ZIe), (ZIid)); if ((ZInode) == NULL) { @@ -441,7 +441,7 @@ p_re__sql(flags flags, lex_state lex_state, act_state act_state, err err, t_ast_ { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 757 "src/libre/parser.act" +#line 753 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXEOF; @@ -484,7 +484,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIclass), (ZInode))) { goto ZL4; @@ -499,7 +499,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; { /* BEGINNING OF ACTION: err-expected-term */ { -#line 
694 "src/libre/parser.act" +#line 690 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXTERM; @@ -522,7 +522,7 @@ ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms:; goto ZL2_expr_C_Ccharacter_Hclass_C_Clist_Hof_Hclass_Hterms; /* END OF INLINE: expr::character-class::list-of-class-terms */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -560,7 +560,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; } /* BEGINNING OF ACTION: ast-add-concat */ { -#line 1041 "src/libre/parser.act" +#line 1040 "src/libre/parser.act" if (!ast_add_expr_concat((ZIcat), (ZIa))) { goto ZL1; @@ -579,7 +579,7 @@ ZL2_expr_C_Clist_Hof_Hpieces:; goto ZL2_expr_C_Clist_Hof_Hpieces; /* END OF INLINE: expr::list-of-pieces */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -606,7 +606,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hterm(flags flags, lex_state lex_state, act_ /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -670,7 +670,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state case (TOK_OPENGROUP): /* BEGINNING OF EXTRACT: OPENGROUP */ { -#line 319 "src/libre/parser.act" +#line 318 "src/libre/parser.act" ZIopen__start = lex_state->lx.start; ZIopen__end = lex_state->lx.end; @@ -688,7 +688,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZIclass) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZIclass) == NULL) { @@ -713,7 +713,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI154 = ']'; ZIclose__start = lex_state->lx.start; @@ -729,7 +729,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, 
act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIopen__start)); mark(&act_state->groupend, &(ZIclose__end)); @@ -748,7 +748,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* BEGINNING OF EXTRACT: INVERT */ { -#line 303 "src/libre/parser.act" +#line 302 "src/libre/parser.act" ZI158 = '^'; @@ -760,7 +760,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZImask) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZImask) == NULL) { @@ -789,7 +789,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state case (TOK_CLOSEGROUP): /* BEGINNING OF EXTRACT: CLOSEGROUP */ { -#line 351 "src/libre/parser.act" +#line 350 "src/libre/parser.act" ZI163 = ']'; ZIclose__start = lex_state->lx.start; @@ -809,7 +809,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-group */ { -#line 768 "src/libre/parser.act" +#line 767 "src/libre/parser.act" mark(&act_state->groupstart, &(ZIopen__start)); mark(&act_state->groupend, &(ZIclose__end)); @@ -823,7 +823,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -839,7 +839,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF INLINE: 162 */ /* BEGINNING OF ACTION: ast-make-subtract */ { -#line 960 "src/libre/parser.act" +#line 959 "src/libre/parser.act" (ZInode) = ast_make_expr_subtract(act_state->poolp, *flags, (ZIclass), (ZImask)); if ((ZInode) == NULL) { @@ -862,7 
+862,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state { /* BEGINNING OF ACTION: err-expected-closegroup */ { -#line 729 "src/libre/parser.act" +#line 725 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCLOSEGROUP; @@ -874,7 +874,7 @@ p_expr_C_Ccharacter_Hclass(flags flags, lex_state lex_state, act_state act_state /* END OF ACTION: err-expected-closegroup */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -917,7 +917,7 @@ p_expr_C_Cpiece(flags flags, lex_state lex_state, act_state act_state, err err, } /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); @@ -954,7 +954,7 @@ p_expr(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__ex { /* BEGINNING OF ACTION: ast-make-alt */ { -#line 868 "src/libre/parser.act" +#line 867 "src/libre/parser.act" (ZInode) = ast_make_expr_alt(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -975,7 +975,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -987,7 +987,7 @@ ZL1:; /* END OF ACTION: err-expected-alts */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -1019,7 +1019,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZIc = '-'; ZI117 = lex_state->lx.start; @@ -1035,7 +1035,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err 
err, t_ast__exp ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (ZIc)); if ((ZInode) == NULL) { @@ -1047,7 +1047,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-make-literal */ /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((*ZIclass), (ZInode))) { goto ZL1; @@ -1058,7 +1058,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp /* END OF ACTION: ast-add-alt */ /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1105,7 +1105,7 @@ p_204(flags flags, lex_state lex_state, act_state act_state, err err, t_ast__exp { /* BEGINNING OF ACTION: ast-make-invert */ { -#line 995 "src/libre/parser.act" +#line 966 "src/libre/parser.act" struct ast_expr *any; @@ -1167,7 +1167,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI { /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZInode) = ast_make_expr_literal(act_state->poolp, *flags, (*ZI205)); if ((ZInode) == NULL) { @@ -1192,7 +1192,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIa).type = AST_ENDPOINT_LITERAL; (ZIa).u.literal.c = (unsigned char) (*ZI205); @@ -1202,7 +1202,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF EXTRACT: RANGE */ { -#line 309 "src/libre/parser.act" +#line 308 "src/libre/parser.act" ZI136 = '-'; ZI137 = lex_state->lx.start; @@ -1220,7 +1220,7 @@ 
p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI case (TOK_CHAR): /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1243,7 +1243,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-range-endpoint-literal */ { -#line 840 "src/libre/parser.act" +#line 839 "src/libre/parser.act" (ZIz).type = AST_ENDPOINT_LITERAL; (ZIz).u.literal.c = (unsigned char) (ZIcz); @@ -1253,7 +1253,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: ast-range-endpoint-literal */ /* BEGINNING OF ACTION: mark-range */ { -#line 773 "src/libre/parser.act" +#line 772 "src/libre/parser.act" mark(&act_state->rangestart, &(*ZI206)); mark(&act_state->rangeend, &(ZIend)); @@ -1263,7 +1263,7 @@ p_208(flags flags, lex_state lex_state, act_state act_state, err err, t_char *ZI /* END OF ACTION: mark-range */ /* BEGINNING OF ACTION: ast-make-range */ { -#line 1007 "src/libre/parser.act" +#line 1004 "src/libre/parser.act" unsigned char lower, upper; @@ -1321,7 +1321,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI176 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -1335,7 +1335,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI209)); mark(&act_state->countend, &(ZIend)); @@ -1345,7 +1345,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 
824 "src/libre/parser.act" if ((*ZIm) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -1376,7 +1376,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -1408,7 +1408,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 case (TOK_CLOSECOUNT): /* BEGINNING OF EXTRACT: CLOSECOUNT */ { -#line 379 "src/libre/parser.act" +#line 378 "src/libre/parser.act" ZI179 = lex_state->lx.start; ZIend = lex_state->lx.end; @@ -1426,7 +1426,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 ADVANCE_LEXER; /* BEGINNING OF ACTION: mark-count */ { -#line 778 "src/libre/parser.act" +#line 777 "src/libre/parser.act" mark(&act_state->countstart, &(*ZI209)); mark(&act_state->countend, &(ZIend)); @@ -1436,7 +1436,7 @@ p_211(flags flags, lex_state lex_state, act_state act_state, err err, t_pos *ZI2 /* END OF ACTION: mark-count */ /* BEGINNING OF ACTION: count-range */ { -#line 825 "src/libre/parser.act" +#line 824 "src/libre/parser.act" if ((ZIn) < (*ZIm)) { err->e = RE_ENEGCOUNT; @@ -1486,7 +1486,7 @@ ZL2_expr_C_Clist_Hof_Halts:; } /* BEGINNING OF ACTION: ast-add-alt */ { -#line 1047 "src/libre/parser.act" +#line 1046 "src/libre/parser.act" if (!ast_add_expr_alt((ZIalts), (ZIa))) { goto ZL1; @@ -1505,7 +1505,7 @@ ZL2_expr_C_Clist_Hof_Halts:; goto ZL2_expr_C_Clist_Hof_Halts; /* END OF INLINE: expr::list-of-alts */ } - /*UNREACHED*/ + /* UNREACHED */ default: break; } @@ -1517,7 +1517,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-alts */ { -#line 715 "src/libre/parser.act" +#line 711 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXALTS; @@ -1549,7 +1549,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, /* BEGINNING OF EXTRACT: OPENCOUNT */ { -#line 371 "src/libre/parser.act" +#line 370 
"src/libre/parser.act" ZI209 = lex_state->lx.start; ZI210 = lex_state->lx.end; @@ -1565,7 +1565,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, case (TOK_COUNT): /* BEGINNING OF EXTRACT: COUNT */ { -#line 636 "src/libre/parser.act" +#line 627 "src/libre/parser.act" unsigned long u; char *e; @@ -1605,7 +1605,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-one */ { -#line 817 "src/libre/parser.act" +#line 816 "src/libre/parser.act" (ZIc) = ast_make_count(0, 1); @@ -1619,7 +1619,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-one-or-more */ { -#line 813 "src/libre/parser.act" +#line 812 "src/libre/parser.act" (ZIc) = ast_make_count(1, AST_COUNT_UNBOUNDED); @@ -1633,7 +1633,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, ADVANCE_LEXER; /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -1646,7 +1646,7 @@ p_expr_C_Cpiece_C_Ccount(flags flags, lex_state lex_state, act_state act_state, { /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -1663,7 +1663,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-count */ { -#line 701 "src/libre/parser.act" +#line 697 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXCOUNT; @@ -1675,7 +1675,7 @@ ZL1:; /* END OF ACTION: err-expected-count */ /* BEGINNING OF ACTION: count-one */ { -#line 821 "src/libre/parser.act" +#line 820 "src/libre/parser.act" (ZIc) = ast_make_count(1, 1); @@ -1704,7 +1704,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 
"src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? &class_any : &class_notnl; @@ -1714,7 +1714,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIe) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -1734,7 +1734,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* BEGINNING OF EXTRACT: CHAR */ { -#line 579 "src/libre/parser.act" +#line 575 "src/libre/parser.act" /* the first byte may be '\x00' */ assert(lex_state->buf.a[1] == '\0'); @@ -1753,7 +1753,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-literal */ { -#line 875 "src/libre/parser.act" +#line 874 "src/libre/parser.act" (ZIe) = ast_make_expr_literal(act_state->poolp, *flags, (ZIa)); if ((ZIe) == NULL) { @@ -1774,7 +1774,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: class-any */ { -#line 784 "src/libre/parser.act" +#line 782 "src/libre/parser.act" /* TODO: or the unicode equivalent */ (ZIa) = (*flags & RE_SINGLE) ? 
&class_any : &class_notnl; @@ -1784,7 +1784,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: class-any */ /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZIg) = ast_make_expr_named(act_state->poolp, *flags, (ZIa)); if ((ZIg) == NULL) { @@ -1796,7 +1796,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: ast-make-named */ /* BEGINNING OF ACTION: count-zero-or-more */ { -#line 809 "src/libre/parser.act" +#line 808 "src/libre/parser.act" (ZIc) = ast_make_count(0, AST_COUNT_UNBOUNDED); @@ -1805,7 +1805,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e /* END OF ACTION: count-zero-or-more */ /* BEGINNING OF ACTION: ast-make-piece */ { -#line 898 "src/libre/parser.act" +#line 897 "src/libre/parser.act" if ((ZIc).min == 0 && (ZIc).max == 0) { (ZIe) = ast_make_expr_empty(act_state->poolp, *flags); @@ -1832,7 +1832,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e ADVANCE_LEXER; /* BEGINNING OF ACTION: make-group-id */ { -#line 882 "src/libre/parser.act" +#line 881 "src/libre/parser.act" (ZIid) = act_state->group_id++; @@ -1846,7 +1846,7 @@ p_expr_C_Cpiece_C_Catom(flags flags, lex_state lex_state, act_state act_state, e } /* BEGINNING OF ACTION: ast-make-group */ { -#line 912 "src/libre/parser.act" +#line 911 "src/libre/parser.act" (ZIe) = ast_make_expr_group(act_state->poolp, *flags, (ZIg), (ZIid)); if ((ZIe) == NULL) { @@ -1884,7 +1884,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-atom */ { -#line 708 "src/libre/parser.act" +#line 704 "src/libre/parser.act" if (err->e == RE_ESUCCESS) { err->e = RE_EXATOM; @@ -1896,7 +1896,7 @@ ZL1:; /* END OF ACTION: err-expected-atom */ /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZIe) = ast_make_expr_empty(act_state->poolp, 
*flags); if ((ZIe) == NULL) { @@ -1932,7 +1932,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hnamed(flags flags, lex_state lex_state, act case (TOK_NAMED__CLASS): /* BEGINNING OF EXTRACT: NAMED_CLASS */ { -#line 648 "src/libre/parser.act" +#line 647 "src/libre/parser.act" ZIid = DIALECT_CLASS(lex_state->buf.a); if (ZIid == NULL) { @@ -1956,7 +1956,7 @@ p_expr_C_Ccharacter_Hclass_C_Cclass_Hnamed(flags flags, lex_state lex_state, act ADVANCE_LEXER; /* BEGINNING OF ACTION: ast-make-named */ { -#line 1034 "src/libre/parser.act" +#line 1033 "src/libre/parser.act" (ZInode) = ast_make_expr_named(act_state->poolp, *flags, (ZIid)); if ((ZInode) == NULL) { @@ -1986,7 +1986,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-concat */ { -#line 861 "src/libre/parser.act" +#line 860 "src/libre/parser.act" (ZInode) = ast_make_expr_concat(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2007,7 +2007,7 @@ p_expr_C_Calt(flags flags, lex_state lex_state, act_state act_state, err err, t_ { /* BEGINNING OF ACTION: ast-make-empty */ { -#line 854 "src/libre/parser.act" +#line 853 "src/libre/parser.act" (ZInode) = ast_make_expr_empty(act_state->poolp, *flags); if ((ZInode) == NULL) { @@ -2032,7 +2032,7 @@ ZL0:; /* BEGINNING OF TRAILER */ -#line 1207 "src/libre/parser.act" +#line 1052 "src/libre/parser.act" static int diff --git a/src/libre/dialect/sql/parser.h b/src/libre/dialect/sql/parser.h index c5e885439..7825ae3af 100644 --- a/src/libre/dialect/sql/parser.h +++ b/src/libre/dialect/sql/parser.h @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 292 "src/libre/parser.act" +#line 281 "src/libre/parser.act" #include @@ -28,7 +28,7 @@ extern void p_re__sql(flags, lex_state, act_state, err, t_ast__expr *); /* BEGINNING OF TRAILER */ -#line 1209 "src/libre/parser.act" +#line 1207 "src/libre/parser.act" #line 35 "src/libre/dialect/sql/parser.h" diff --git a/src/libre/libre.syms b/src/libre/libre.syms index 
9d381cb0f..b04833777 100644 --- a/src/libre/libre.syms +++ b/src/libre/libre.syms @@ -4,6 +4,7 @@ re_flags re_strerror re_perror re_is_anchored +re_interpolate_groups ast_print ast_print_dot diff --git a/src/libre/re_interpolate_groups.c b/src/libre/re_interpolate_groups.c new file mode 100644 index 000000000..52b61f116 --- /dev/null +++ b/src/libre/re_interpolate_groups.c @@ -0,0 +1,198 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define OUT_CHAR(c) \ + do if (outs != NULL) { \ + if (outn < 1) goto overflow; \ + *outs++ = (c); \ + outn--; \ + } while (0) + +#define OUT_GROUP(s) \ + do if (outs != NULL) { \ + size_t n = strlen((s)); \ + if (outn < n) goto overflow; \ + (void) memcpy(outs, s, n); \ + outs += n; \ + outn -= n; \ + } while (0) + +bool +re_interpolate_groups(const char *fmt, char esc, + const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent, + char *outs, size_t outn, + struct re_pos *start, struct re_pos *end) +{ + unsigned group; // 0 meaning group0, 1 meaning groupv[0], etc + char *outs_orig; + const char *p; + + enum { + STATE_LIT, + STATE_ESC, + STATE_DIGIT + } state; + + assert(esc != '\0'); + assert(group0 != NULL || groupc == 0); + assert(groupc < UINT_MAX / 10 - 1); + assert(outs != NULL || outn == 0); + + state = STATE_LIT; + group = 0; + + outs_orig = outn > 0 ? 
outs : NULL; + + if (start != NULL) { + start->byte = 0; + } + + p = fmt; + + do { + switch (state) { + case STATE_LIT: + if (*p == '\0') { + break; + } + + if (*p == esc) { + if (start != NULL) { + start->byte = p - fmt; + } + + state = STATE_ESC; + continue; + } + + OUT_CHAR(*p); + continue; + + case STATE_ESC: + if (*p == '\0') { + goto error; + } + + if (*p == esc) { + OUT_CHAR(esc); + state = STATE_LIT; + continue; + } + + if (isdigit((unsigned char) *p)) { + group = *p - '0'; + state = STATE_DIGIT; + continue; + } + + goto error; + + case STATE_DIGIT: + if (isdigit((unsigned char) *p)) { + group *= 10; + group += *p - '0'; + + /* + * We need to handle numeric overflow somehow here, + * as we would with using strtol() or similar. But + * we don't need to distinguish this as a special + * error code, semantically it's the same as a group + * that doesn't exist. + * + * groupc + 1 is always out of bounds. So we cap to that, + * using it as a simple way to avoid needing to handle + * numeric overflow for subsequent digits. This assumes + * groupc *= 10 is <= UINT_MAX. + */ + if (group > groupc) { + group = groupc + 1; + } + continue; + } + + if (group == 0) { + OUT_GROUP(group0); + } else if (group <= groupc) { + assert(groupv[group - 1] != NULL); + OUT_GROUP(groupv[group - 1]); + } else if (nonexistent == NULL) { + /* + * We could indicate this independently from syntax errors, + * with some way to return different error codes. + * + * But there's no need, you can pre-check the fmt syntax + * by running ahead of time with groupc == 0 and pass + * nonexistent != NULL, because that eliminates the + * possibility for group-related errors. 
+ */ + goto error; + } else { + OUT_GROUP(nonexistent); + } + + group = 0; + state = STATE_LIT; + + if (*p == '\0') { + break; + } + + if (*p == esc) { + if (start != NULL) { + start->byte = p - fmt; + } + + state = STATE_ESC; + continue; + } + + OUT_CHAR(*p); + continue; + + default: + assert(!"unreached"); + goto error; + } + } while (*p != '\0' && p++); + + if (state != STATE_LIT) { + goto error; + } + + OUT_CHAR('\0'); + + return true; + +overflow: + + /* we're blaming the entire fmt string for overflow */ + if (start != NULL) { + start->byte = 0; + } + +error: + + if (end != NULL) { + end->byte = p - fmt; + } + + if (outs_orig != NULL) { + *outs_orig = '\0'; + } + + return false; +} + diff --git a/src/lx/lexer.c b/src/lx/lexer.c index 03bfbf463..ed7ca6d78 100644 --- a/src/lx/lexer.c +++ b/src/lx/lexer.c @@ -14,6 +14,26 @@ static enum lx_token z2(struct lx *lx); static enum lx_token z3(struct lx *lx); static enum lx_token z4(struct lx *lx); +static int +lx_advance_end(struct lx *lx, int c) +{ + lx->end.byte++; + lx->end.col++; + if (c == '\n') { + lx->end.line++; + lx->end.saved_col = lx->end.col - 1; + lx->end.col = 1; + } + if (lx->push != NULL) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { + return 0; + } + } + return 1; +} + +/* This wrapper manages one character of lookahead/pushback + * and the line, column, and byte offsets. */ #if __STDC_VERSION__ >= 199901L inline #endif @@ -34,18 +54,19 @@ lx_getc(struct lx *lx) } } - lx->end.byte++; - lx->end.col++; - - if (c == '\n') { - lx->end.line++; - lx->end.saved_col = lx->end.col - 1; - lx->end.col = 1; - } + if (!lx_advance_end(lx, c)) { return EOF; } return c; } +/* This wrapper adapts calling lx_getc to the interface + * in libfsm's generated code. 
*/ +static int +fsm_getc(void *getc_opaque) +{ + return lx_getc((struct lx *)getc_opaque); +} + #if __STDC_VERSION__ >= 199901L inline #endif @@ -54,10 +75,7 @@ lx_ungetc(struct lx *lx, int c) { assert(lx != NULL); assert(lx->c == EOF); - lx->c = c; - - lx->end.byte--; lx->end.col--; @@ -67,13 +85,20 @@ lx_ungetc(struct lx *lx, int c) } } +/* Get a character from fgetc and push it to the buffer */ int lx_fgetc(struct lx *lx) { assert(lx != NULL); assert(lx->getc_opaque != NULL); - return fgetc(lx->getc_opaque); + const int c = fgetc(lx->getc_opaque); + if (c == EOF) { + lx->c = EOF; + return EOF; + } else { + return c; + } } int @@ -118,6 +143,17 @@ lx_dynpush(void *buf_opaque, char c) return 0; } +static void +lx_dynpop(void *buf_opaque) +{ + struct lx_dynbuf *t = buf_opaque; + + assert(t != NULL); + + assert(t->p != t->a); + t->p--; +} + int lx_dynclear(void *buf_opaque) { @@ -157,37 +193,36 @@ lx_dynfree(void *buf_opaque) static enum lx_token z0(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '/': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. 
"\057" */ switch ((unsigned char) c) { @@ -243,79 +278,73 @@ z0(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return lx->z = z4, TOK_RE; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, TOK_RE; } break; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return lx->z = z4, TOK_RE; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_RE; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z1(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { - case '"': state = S2; break; - case '\\': state = S3; break; - default: state = S1; break; + case '\\': state = S1; break; + case '"': state = S3; break; + default: state = S2; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; - - case S2: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z4, TOK_STR; - - case S3: /* e.g. "\\" */ + case S1: /* e.g. 
"\\" */ switch ((unsigned char) c) { - case '"': - case '\\': - case 'f': - case 'n': - case 'r': - case 't': - case 'v': state = S4; break; + case 'x': state = S4; break; case '0': case '1': case '2': @@ -324,29 +353,24 @@ z1(struct lx *lx) case '5': case '6': case '7': state = S5; break; - case 'x': state = S6; break; - default: lx_ungetc(lx, c); return TOK_CHAR; + case '"': + case '\\': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': state = S6; break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; } break; - case S4: /* e.g. "\\f" */ - lx_ungetc(lx, c); return TOK_ESC; + case S2: /* e.g. "\\x00" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; - case S5: /* e.g. "\\0" */ - switch ((unsigned char) c) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': break; - default: lx_ungetc(lx, c); return TOK_OCT; - } - break; + case S3: /* e.g. "\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, TOK_STR; - case S6: /* e.g. "\\x" */ + case S4: /* e.g. "\\x" */ switch ((unsigned char) c) { case '0': case '1': @@ -370,11 +394,30 @@ z1(struct lx *lx) case 'd': case 'e': case 'f': state = S7; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } + break; + + case S5: /* e.g. "\\0" */ + switch ((unsigned char) c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_OCT; } break; - case S7: /* e.g. "\\xa" */ + case S6: /* e.g. "\\\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_ESC; + + case S7: /* e.g. 
"\\x0" */ switch ((unsigned char) c) { case '0': case '1': @@ -398,135 +441,157 @@ z1(struct lx *lx) case 'd': case 'e': case 'f': break; - default: lx_ungetc(lx, c); return TOK_HEX; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_HEX; } break; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return TOK_CHAR; + case S3: return lx->z = z4, TOK_STR; + case S5: return TOK_OCT; + case S6: return TOK_ESC; + case S7: return TOK_HEX; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_STR; - case S3: return TOK_CHAR; - case S4: return TOK_ESC; - case S5: return TOK_OCT; - case S7: return TOK_HEX; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z2(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + + state = S0; + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\'': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return TOK_CHAR; + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CHAR; case S2: /* e.g. 
"'" */ - lx_ungetc(lx, c); return lx->z = z4, TOK_STR; + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, TOK_STR; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_CHAR; + case S2: return lx->z = z4, TOK_STR; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_CHAR; - case S2: return TOK_STR; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z3(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. "" */ switch ((unsigned char) c) { case '\n': state = S2; break; default: state = S1; break; } break; - case S1: /* e.g. "a" */ - lx_ungetc(lx, c); return lx->z(lx); + case S1: /* e.g. "" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); case S2: /* e.g. 
"" */ - lx_ungetc(lx, c); return lx->z = z4, lx->z(lx); + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z4, lx->z(lx); default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_UNKNOWN; + case S2: return lx->z = z4, lx->z(lx); + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { case S0: @@ -536,75 +601,52 @@ z3(struct lx *lx) default: if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } break; } - } lx->lgetc = NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_EOF; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } static enum lx_token z4(struct lx *lx) { + int has_consumed_input = 0; int c; - enum { - S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, - S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, - S20, S21, S22, S23, S24, S25, S26, S27, S28, NONE - } state; - assert(lx != NULL); if (lx->clear != NULL) { lx->clear(lx->buf_opaque); } - state = NONE; - lx->start = lx->end; - while (c = lx_getc(lx), c != EOF) { - if (state == NONE) { - state = S0; - } + void *getc_opaque = (void *)lx; + enum { + S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, + S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, + S20, S21, S22, S23, S24, S25, S26, S27, S28 + } state; + state = S0; + + while (c = fsm_getc(getc_opaque), c != EOF) { + has_consumed_input = 1; switch (state) { - case S0: /* start */ + case S0: /* e.g. 
"" */ switch ((unsigned char) c) { - case '\t': - case '\n': - case '\r': - case ' ': state = S1; break; - case '!': state = S2; break; - case '"': state = S3; break; - case '#': state = S4; break; - case '$': state = S5; break; - case '&': state = S6; break; - case '\'': state = S7; break; - case '(': state = S8; break; - case ')': state = S9; break; - case '*': state = S10; break; - case '+': state = S11; break; - case ',': state = S12; break; - case '-': state = S13; break; - case '.': state = S14; break; - case '/': state = S15; break; - case ';': state = S16; break; - case '=': state = S17; break; - case '?': state = S18; break; + case ',': state = S1; break; + case '$': state = S2; break; case 'A': case 'B': case 'C': @@ -657,37 +699,42 @@ z4(struct lx *lx) case 'w': case 'x': case 'y': - case 'z': state = S19; break; - case '\\': state = S20; break; - case '^': state = S21; break; - case '{': state = S22; break; - case '|': state = S23; break; - case '}': state = S24; break; - case '~': state = S25; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; - } - break; - - case S1: /* e.g. 
"\\x09" */ - switch ((unsigned char) c) { + case 'z': state = S3; break; + case '&': state = S4; break; + case '|': state = S5; break; + case '.': state = S6; break; + case '-': state = S7; break; + case '\\': state = S8; break; + case '^': state = S9; break; + case '!': state = S10; break; + case '~': state = S11; break; + case '?': state = S12; break; + case '+': state = S13; break; + case '*': state = S14; break; + case ')': state = S15; break; + case '(': state = S16; break; + case '}': state = S17; break; + case '{': state = S18; break; + case ';': state = S19; break; + case '=': state = S20; break; + case '/': state = S21; break; + case '"': state = S22; break; + case '\'': state = S23; break; + case '#': state = S24; break; case '\t': case '\n': case '\r': - case ' ': break; - default: lx_ungetc(lx, c); return lx->z(lx); + case ' ': state = S25; break; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S2: /* e.g. "!" */ - lx_ungetc(lx, c); return TOK_BANG; - - case S3: /* e.g. "\"" */ - lx_ungetc(lx, c); return lx->z = z1, lx->z(lx); - - case S4: /* e.g. "#" */ - lx_ungetc(lx, c); return lx->z = z3, lx->z(lx); + case S1: /* e.g. "," */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_COMMA; - case S5: /* e.g. "$" */ + case S2: /* e.g. "$" */ switch ((unsigned char) c) { case 'A': case 'B': @@ -742,58 +789,13 @@ z4(struct lx *lx) case 'x': case 'y': case 'z': state = S28; break; - default: lx->lgetc = NULL; return TOK_UNKNOWN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; } break; - case S6: /* e.g. "&" */ - lx_ungetc(lx, c); return TOK_AND; - - case S7: /* e.g. "'" */ - lx_ungetc(lx, c); return lx->z = z2, lx->z(lx); - - case S8: /* e.g. "(" */ - lx_ungetc(lx, c); return TOK_LPAREN; - - case S9: /* e.g. ")" */ - lx_ungetc(lx, c); return TOK_RPAREN; - - case S10: /* e.g. 
"*" */ - lx_ungetc(lx, c); return TOK_STAR; - - case S11: /* e.g. "+" */ - lx_ungetc(lx, c); return TOK_CROSS; - - case S12: /* e.g. "," */ - lx_ungetc(lx, c); return TOK_COMMA; - - case S13: /* e.g. "-" */ - switch ((unsigned char) c) { - case '>': state = S27; break; - default: lx_ungetc(lx, c); return TOK_DASH; - } - break; - - case S14: /* e.g. "." */ - switch ((unsigned char) c) { - case '.': state = S26; break; - default: lx_ungetc(lx, c); return TOK_DOT; - } - break; - - case S15: /* e.g. "\057" */ - lx_ungetc(lx, c); return lx->z = z0, lx->z(lx); - - case S16: /* e.g. ";" */ - lx_ungetc(lx, c); return TOK_SEMI; - - case S17: /* e.g. "=" */ - lx_ungetc(lx, c); return TOK_BIND; - - case S18: /* e.g. "?" */ - lx_ungetc(lx, c); return TOK_QMARK; - - case S19: /* e.g. "a" */ + case S3: /* e.g. "A" */ switch ((unsigned char) c) { case '0': case '1': @@ -858,35 +860,98 @@ z4(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return TOK_IDENT; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_IDENT; } break; - case S20: /* e.g. "\\" */ - lx_ungetc(lx, c); return TOK_DASH; + case S4: /* e.g. "&" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_AND; + + case S5: /* e.g. "|" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_PIPE; + + case S6: /* e.g. "." */ + switch ((unsigned char) c) { + case '.': state = S27; break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_DOT; + } + break; + + case S7: /* e.g. "-" */ + switch ((unsigned char) c) { + case '>': state = S26; break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_DASH; + } + break; + + case S8: /* e.g. "\\" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_DASH; + + case S9: /* e.g. "^" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_HAT; - case S21: /* e.g. "^" */ - lx_ungetc(lx, c); return TOK_HAT; + case S10: /* e.g. "!" 
*/ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_BANG; - case S22: /* e.g. "{" */ - lx_ungetc(lx, c); return TOK_OPEN; + case S11: /* e.g. "~" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TILDE; - case S23: /* e.g. "|" */ - lx_ungetc(lx, c); return TOK_PIPE; + case S12: /* e.g. "\077" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_QMARK; - case S24: /* e.g. "}" */ - lx_ungetc(lx, c); return TOK_CLOSE; + case S13: /* e.g. "+" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CROSS; - case S25: /* e.g. "~" */ - lx_ungetc(lx, c); return TOK_TILDE; + case S14: /* e.g. "*" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_STAR; - case S26: /* e.g. ".." */ - lx_ungetc(lx, c); return TOK_TO; + case S15: /* e.g. ")" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_RPAREN; - case S27: /* e.g. "->" */ - lx_ungetc(lx, c); return TOK_MAP; + case S16: /* e.g. "(" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_LPAREN; - case S28: /* e.g. "$a" */ + case S17: /* e.g. "}" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_CLOSE; + + case S18: /* e.g. "{" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_OPEN; + + case S19: /* e.g. ";" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_SEMI; + + case S20: /* e.g. "=" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_BIND; + + case S21: /* e.g. "\057" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z0, lx->z(lx); + + case S22: /* e.g. "\"" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z1, lx->z(lx); + + case S23: /* e.g. "'" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z2, lx->z(lx); + + case S24: /* e.g. "#" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z = z3, lx->z(lx); + + case S25: /* e.g. 
"\\x09" */ + switch ((unsigned char) c) { + case '\t': + case '\n': + case '\r': + case ' ': break; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return lx->z(lx); + } + break; + + case S26: /* e.g. "->" */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_MAP; + + case S27: /* e.g. ".." */ + lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TO; + + case S28: /* e.g. "$A" */ switch ((unsigned char) c) { case '0': case '1': @@ -951,66 +1016,73 @@ z4(struct lx *lx) case 'x': case 'y': case 'z': break; - default: lx_ungetc(lx, c); return TOK_TOKEN; + default: lx_ungetc(lx, c); lx_dynpop(lx->buf_opaque); return TOK_TOKEN; } break; default: ; /* unreached */ } + } + + /* end states */ + switch (state) { + case S1: return TOK_COMMA; + case S3: return TOK_IDENT; + case S4: return TOK_AND; + case S5: return TOK_PIPE; + case S6: return TOK_DOT; + case S7: return TOK_DASH; + case S8: return TOK_DASH; + case S9: return TOK_HAT; + case S10: return TOK_BANG; + case S11: return TOK_TILDE; + case S12: return TOK_QMARK; + case S13: return TOK_CROSS; + case S14: return TOK_STAR; + case S15: return TOK_RPAREN; + case S16: return TOK_LPAREN; + case S17: return TOK_CLOSE; + case S18: return TOK_OPEN; + case S19: return TOK_SEMI; + case S20: return TOK_BIND; + case S21: return lx->z = z0, lx->z(lx); + case S22: return lx->z = z1, lx->z(lx); + case S23: return lx->z = z2, lx->z(lx); + case S24: return lx->z = z3, lx->z(lx); + case S25: return TOK_EOF; + case S26: return TOK_MAP; + case S27: return TOK_TO; + case S28: return TOK_TOKEN; + default: + if (!has_consumed_input) { return TOK_EOF; } + lx_ungetc(lx, c); lx->lgetc = NULL; return TOK_UNKNOWN; + } switch (state) { - case S1: - case S3: - case S4: - case S7: - case S15: + case S21: + case S22: + case S23: + case S24: + case S25: break; default: if (lx->push != NULL) { - if (-1 == lx->push(lx->buf_opaque, c)) { + if (-1 == lx->push(lx->buf_opaque, (char)c)) { return TOK_ERROR; } } break; } - } lx->lgetc = 
NULL; - switch (state) { - case NONE: return TOK_EOF; - case S1: return TOK_EOF; - case S2: return TOK_BANG; - case S3: return TOK_EOF; - case S4: return TOK_EOF; - case S6: return TOK_AND; - case S7: return TOK_EOF; - case S8: return TOK_LPAREN; - case S9: return TOK_RPAREN; - case S10: return TOK_STAR; - case S11: return TOK_CROSS; - case S12: return TOK_COMMA; - case S13: return TOK_DASH; - case S14: return TOK_DOT; - case S15: return TOK_EOF; - case S16: return TOK_SEMI; - case S17: return TOK_BIND; - case S18: return TOK_QMARK; - case S19: return TOK_IDENT; - case S20: return TOK_DASH; - case S21: return TOK_HAT; - case S22: return TOK_OPEN; - case S23: return TOK_PIPE; - case S24: return TOK_CLOSE; - case S25: return TOK_TILDE; - case S26: return TOK_TO; - case S27: return TOK_MAP; - case S28: return TOK_TOKEN; - default: errno = EINVAL; return TOK_ERROR; - } + if (!has_consumed_input) { + return TOK_EOF; + } + return TOK_ERROR; } const char * @@ -1238,6 +1310,7 @@ lx_init(struct lx *lx) lx->end.byte = 0; lx->end.line = 1; lx->end.col = 1; + (void)lx_dynpop; } enum lx_token diff --git a/src/lx/main.c b/src/lx/main.c index ea16ee70d..92098bfd9 100644 --- a/src/lx/main.c +++ b/src/lx/main.c @@ -695,6 +695,10 @@ main(int argc, char *argv[]) opt.comments = 0; } + if (lang == LX_PRINT_C) { + opt.fragment = 1; + } + { if (print_progress) { fprintf(stderr, "-- parsing:"); diff --git a/src/lx/parser.c b/src/lx/parser.c index f6f759693..4ab69ce21 100644 --- a/src/lx/parser.c +++ b/src/lx/parser.c @@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 127 "src/lx/parser.act" +#line 27 "src/lx/parser.act" #include @@ -182,7 +182,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-list */ { -#line 816 "src/lx/parser.act" +#line 814 "src/lx/parser.act" err_expected(lex_state, "list of mappings, bindings or zones"); @@ -204,7 +204,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) /* BEGINNING OF EXTRACT: IDENT */ { -#line 228 "src/lx/parser.act" +#line 
227 "src/lx/parser.act" ZIn = xstrdup(lex_state->buf.a); @@ -214,7 +214,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: deref-var */ { -#line 280 "src/lx/parser.act" +#line 277 "src/lx/parser.act" struct ast_zone *z; @@ -252,7 +252,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) /* BEGINNING OF EXTRACT: TOKEN */ { -#line 224 "src/lx/parser.act" +#line 222 "src/lx/parser.act" /* TODO: submatch addressing */ ZIt = xstrdup(lex_state->buf.a + 1); /* +1 for '$' prefix */ @@ -263,7 +263,7 @@ p_pattern(lex_state lex_state, act_state act_state, zone ZIz, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: deref-token */ { -#line 308 "src/lx/parser.act" +#line 304 "src/lx/parser.act" const struct ast_mapping *m; fsm_state_t start; @@ -374,7 +374,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: op-reverse */ { -#line 677 "src/lx/parser.act" +#line 676 "src/lx/parser.act" assert((ZI210) != NULL); @@ -398,7 +398,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI231) != NULL); @@ -449,7 +449,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: op-complete */ { -#line 668 "src/lx/parser.act" +#line 667 "src/lx/parser.act" assert((ZI210) != NULL); @@ -473,7 +473,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI239) != NULL); @@ -509,7 +509,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z /* BEGINNING OF EXTRACT: IDENT */ { -#line 228 "src/lx/parser.act" +#line 227 "src/lx/parser.act" ZIn = xstrdup(lex_state->buf.a); @@ -563,7 
+563,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI248) != NULL); @@ -614,7 +614,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: op-complement */ { -#line 659 "src/lx/parser.act" +#line 658 "src/lx/parser.act" assert((ZI210) != NULL); @@ -638,7 +638,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI223) != NULL); @@ -685,7 +685,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z /* BEGINNING OF EXTRACT: TOKEN */ { -#line 224 "src/lx/parser.act" +#line 222 "src/lx/parser.act" /* TODO: submatch addressing */ ZI253 = xstrdup(lex_state->buf.a + 1); /* +1 for '$' prefix */ @@ -696,7 +696,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z ADVANCE_LEXER; /* BEGINNING OF ACTION: deref-token */ { -#line 308 "src/lx/parser.act" +#line 304 "src/lx/parser.act" const struct ast_mapping *m; fsm_state_t start; @@ -769,7 +769,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI259) != NULL); @@ -829,7 +829,7 @@ p_list_Hof_Hthings_C_Cthing(lex_state lex_state, act_state act_state, ast ZIa, z } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI268) != NULL); @@ -869,7 +869,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-thing */ { -#line 812 "src/lx/parser.act" +#line 810 "src/lx/parser.act" err_expected(lex_state, "mapping, binding or zone"); @@ -899,7 +899,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-open */ { -#line 800 
"src/lx/parser.act" +#line 798 "src/lx/parser.act" err_expected(lex_state, "'{'"); @@ -929,7 +929,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-close */ { -#line 804 "src/lx/parser.act" +#line 802 "src/lx/parser.act" err_expected(lex_state, "'}'"); @@ -955,7 +955,7 @@ ZL2_pattern_C_Cbody:; { /* BEGINNING OF EXTRACT: CHAR */ { -#line 219 "src/lx/parser.act" +#line 215 "src/lx/parser.act" assert(lex_state->buf.a[0] != '\0'); assert(lex_state->buf.a[1] == '\0'); @@ -972,7 +972,7 @@ ZL2_pattern_C_Cbody:; { /* BEGINNING OF EXTRACT: ESC */ { -#line 149 "src/lx/parser.act" +#line 143 "src/lx/parser.act" assert(lex_state->buf.a[0] == '\\'); assert(lex_state->buf.a[1] != '\0'); @@ -1000,7 +1000,7 @@ ZL2_pattern_C_Cbody:; { /* BEGINNING OF EXTRACT: HEX */ { -#line 212 "src/lx/parser.act" +#line 188 "src/lx/parser.act" unsigned long u; char *e; @@ -1037,7 +1037,7 @@ ZL2_pattern_C_Cbody:; { /* BEGINNING OF EXTRACT: OCT */ { -#line 185 "src/lx/parser.act" +#line 161 "src/lx/parser.act" unsigned long u; char *e; @@ -1077,7 +1077,7 @@ ZL2_pattern_C_Cbody:; /* END OF INLINE: 84 */ /* BEGINNING OF ACTION: pattern-char */ { -#line 249 "src/lx/parser.act" +#line 247 "src/lx/parser.act" /* TODO */ *lex_state->p++ = (ZIc); @@ -1089,7 +1089,7 @@ ZL2_pattern_C_Cbody:; goto ZL2_pattern_C_Cbody; /* END OF INLINE: pattern::body */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -1124,7 +1124,7 @@ ZL2_174:; } /* END OF INLINE: 174 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI171 = ZI168; @@ -1211,7 +1211,7 @@ ZL2_180:; } /* BEGINNING OF ACTION: op-alt */ { -#line 725 "src/lx/parser.act" +#line 724 "src/lx/parser.act" assert((ZI177) != NULL); assert((ZIb) != NULL); @@ -1230,7 +1230,7 @@ ZL2_180:; goto ZL2_180; /* END OF INLINE: 180 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI178 = ZI176; @@ -1291,7 +1291,7 @@ p_expr_C_Cprefix_Hexpr(lex_state lex_state, act_state act_state, zone ZIz, fsm * } /* BEGINNING OF ACTION: op-reverse */ { -#line 677 
"src/lx/parser.act" +#line 676 "src/lx/parser.act" assert((ZIq) != NULL); @@ -1315,7 +1315,7 @@ p_expr_C_Cprefix_Hexpr(lex_state lex_state, act_state act_state, zone ZIz, fsm * } /* BEGINNING OF ACTION: op-complete */ { -#line 668 "src/lx/parser.act" +#line 667 "src/lx/parser.act" assert((ZIq) != NULL); @@ -1339,7 +1339,7 @@ p_expr_C_Cprefix_Hexpr(lex_state lex_state, act_state act_state, zone ZIz, fsm * } /* BEGINNING OF ACTION: op-complement */ { -#line 659 "src/lx/parser.act" +#line 658 "src/lx/parser.act" assert((ZIq) != NULL); @@ -1398,7 +1398,7 @@ ZL2_186:; } /* BEGINNING OF ACTION: op-intersect */ { -#line 714 "src/lx/parser.act" +#line 713 "src/lx/parser.act" assert((ZI183) != NULL); assert((ZIb) != NULL); @@ -1417,7 +1417,7 @@ ZL2_186:; goto ZL2_186; /* END OF INLINE: 186 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI184 = ZI182; @@ -1455,7 +1455,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hto_Hmappings_C_Clist_Hof_ } /* BEGINNING OF ACTION: op-alt */ { -#line 725 "src/lx/parser.act" +#line 724 "src/lx/parser.act" assert((ZIold_Hexit) != NULL); assert((ZInew_Hexit) != NULL); @@ -1473,7 +1473,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hto_Hmappings_C_Clist_Hof_ goto ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hto_Hmappings_C_Clist_Hof_Hzone_Hto_Hmappings_Hx; /* END OF INLINE: list-of-things::zone-thing::list-of-zone-to-mappings::list-of-zone-to-mappings-x */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -1554,7 +1554,7 @@ ZL2_197:; ADVANCE_LEXER; /* BEGINNING OF ACTION: op-cross */ { -#line 575 "src/lx/parser.act" +#line 568 "src/lx/parser.act" fsm_state_t start, end; fsm_state_t old; @@ -1607,13 +1607,13 @@ ZL2_197:; goto ZL2_197; /* END OF INLINE: 197 */ } - /*UNREACHED*/ + /* UNREACHED */ case (TOK_QMARK): { ADVANCE_LEXER; /* BEGINNING OF ACTION: op-qmark */ { -#line 620 "src/lx/parser.act" +#line 613 "src/lx/parser.act" fsm_state_t start, end; fsm_state_t old; @@ -1666,13 
+1666,13 @@ ZL2_197:; goto ZL2_197; /* END OF INLINE: 197 */ } - /*UNREACHED*/ + /* UNREACHED */ case (TOK_STAR): { ADVANCE_LEXER; /* BEGINNING OF ACTION: op-star */ { -#line 525 "src/lx/parser.act" +#line 518 "src/lx/parser.act" fsm_state_t start, end; fsm_state_t old; @@ -1730,14 +1730,14 @@ ZL2_197:; goto ZL2_197; /* END OF INLINE: 197 */ } - /*UNREACHED*/ + /* UNREACHED */ default: goto ZL1; } } /* END OF INLINE: 272 */ } - /*UNREACHED*/ + /* UNREACHED */ default: { ZI196 = ZI191; @@ -1795,7 +1795,7 @@ p_204(lex_state lex_state, act_state act_state, zone *ZIz, fsm *ZI202, fsm *ZOq) } /* BEGINNING OF ACTION: op-subtract */ { -#line 703 "src/lx/parser.act" +#line 702 "src/lx/parser.act" assert((*ZI202) != NULL); assert((ZIb) != NULL); @@ -1870,7 +1870,7 @@ p_208(lex_state lex_state, act_state act_state, zone *ZIz, fsm *ZI206, fsm *ZOq) } /* BEGINNING OF ACTION: op-concat */ { -#line 686 "src/lx/parser.act" +#line 685 "src/lx/parser.act" assert((*ZI206) != NULL); assert((ZIb) != NULL); @@ -1920,7 +1920,7 @@ p_212(lex_state lex_state, act_state act_state, zone *ZIz, fsm *ZI210, fsm *ZOq) } /* BEGINNING OF ACTION: op-product */ { -#line 698 "src/lx/parser.act" +#line 696 "src/lx/parser.act" fprintf(stderr, "unimplemented\n"); (ZIq) = NULL; @@ -1960,7 +1960,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* BEGINNING OF EXTRACT: RE */ { -#line 236 "src/lx/parser.act" +#line 231 "src/lx/parser.act" assert(lex_state->buf.a[0] == '/'); @@ -1980,7 +1980,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: pattern-buffer */ { -#line 263 "src/lx/parser.act" +#line 252 "src/lx/parser.act" size_t len; @@ -2010,7 +2010,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* END OF ACTION: pattern-buffer */ /* BEGINNING OF ACTION: compile-regex */ { -#line 379 "src/lx/parser.act" +#line 376 "src/lx/parser.act" struct re_err err; @@ -2030,7 +2030,7 @@ p_215(lex_state lex_state, act_state act_state, fsm 
*ZOr) /* END OF ACTION: compile-regex */ /* BEGINNING OF ACTION: free-arr */ { -#line 766 "src/lx/parser.act" +#line 765 "src/lx/parser.act" free((ZIa)); @@ -2046,7 +2046,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) ADVANCE_LEXER; /* BEGINNING OF ACTION: pattern-buffer */ { -#line 263 "src/lx/parser.act" +#line 252 "src/lx/parser.act" size_t len; @@ -2076,7 +2076,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* END OF ACTION: pattern-buffer */ /* BEGINNING OF ACTION: compile-literal */ { -#line 364 "src/lx/parser.act" +#line 361 "src/lx/parser.act" struct re_err err; @@ -2096,7 +2096,7 @@ p_215(lex_state lex_state, act_state act_state, fsm *ZOr) /* END OF ACTION: compile-literal */ /* BEGINNING OF ACTION: free-arr */ { -#line 766 "src/lx/parser.act" +#line 765 "src/lx/parser.act" free((ZIa)); @@ -2164,7 +2164,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Ho } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZIr) != NULL); @@ -2189,7 +2189,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Ho /* END OF ACTION: subtract-exit */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2229,7 +2229,7 @@ ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Ho goto ZL2_list_Hof_Hthings_C_Czone_Hthing_C_Clist_Hof_Hzone_Hfrom_Hmappings_C_Clist_Hof_Hzone_Hfrom_Hmappings_Hx; /* END OF INLINE: list-of-things::zone-thing::list-of-zone-from-mappings::list-of-zone-from-mappings-x */ } - /*UNREACHED*/ + /* UNREACHED */ case (ERROR_TERMINAL): return; default: @@ -2256,7 +2256,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* BEGINNING OF ACTION: no-zone */ { -#line 515 "src/lx/parser.act" +#line 514 "src/lx/parser.act" (ZIparent) = NULL; @@ -2265,7 +2265,7 @@ p_lx(lex_state lex_state, 
act_state act_state, ast *ZOa) /* END OF ACTION: no-zone */ /* BEGINNING OF ACTION: make-ast */ { -#line 424 "src/lx/parser.act" +#line 423 "src/lx/parser.act" (ZIa) = ast_new(); if ((ZIa) == NULL) { @@ -2278,7 +2278,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* END OF ACTION: make-ast */ /* BEGINNING OF ACTION: make-zone */ { -#line 432 "src/lx/parser.act" +#line 431 "src/lx/parser.act" assert((ZIa) != NULL); @@ -2301,7 +2301,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* END OF ACTION: make-zone */ /* BEGINNING OF ACTION: no-exit */ { -#line 511 "src/lx/parser.act" +#line 510 "src/lx/parser.act" (ZIexit) = NULL; @@ -2310,7 +2310,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) /* END OF ACTION: no-exit */ /* BEGINNING OF ACTION: set-globalzone */ { -#line 500 "src/lx/parser.act" +#line 499 "src/lx/parser.act" assert((ZIa) != NULL); assert((ZIz) != NULL); @@ -2341,7 +2341,7 @@ p_lx(lex_state lex_state, act_state act_state, ast *ZOa) { /* BEGINNING OF ACTION: err-expected-eof */ { -#line 808 "src/lx/parser.act" +#line 806 "src/lx/parser.act" err_expected(lex_state, "EOF"); @@ -2358,7 +2358,7 @@ ZL1:; { /* BEGINNING OF ACTION: make-ast */ { -#line 424 "src/lx/parser.act" +#line 423 "src/lx/parser.act" (ZIa) = ast_new(); if ((ZIa) == NULL) { @@ -2371,7 +2371,7 @@ ZL1:; /* END OF ACTION: make-ast */ /* BEGINNING OF ACTION: err-syntax */ { -#line 776 "src/lx/parser.act" +#line 773 "src/lx/parser.act" err(lex_state, "Syntax error"); exit(EXIT_FAILURE); @@ -2433,7 +2433,7 @@ p_list_Hof_Hthings_C_Czone_Hthing_C_Czone_Hto_Hmapping(lex_state lex_state, act_ } /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2471,7 +2471,7 @@ p_list_Hof_Hthings_C_Czone_Hthing_C_Czone_Hto_Hmapping(lex_state lex_state, act_ /* END OF ACTION: add-mapping */ /* BEGINNING OF ACTION: clone */ { -#line 756 "src/lx/parser.act" +#line 755 
"src/lx/parser.act" assert((ZIr) != NULL); @@ -2517,7 +2517,7 @@ p_112(lex_state lex_state, act_state act_state, string *ZOt) { /* BEGINNING OF ACTION: err-expected-map */ { -#line 784 "src/lx/parser.act" +#line 782 "src/lx/parser.act" err_expected(lex_state, "'->'"); @@ -2532,7 +2532,7 @@ p_112(lex_state lex_state, act_state act_state, string *ZOt) case (TOK_TOKEN): /* BEGINNING OF EXTRACT: TOKEN */ { -#line 224 "src/lx/parser.act" +#line 222 "src/lx/parser.act" /* TODO: submatch addressing */ ZIt = xstrdup(lex_state->buf.a + 1); /* +1 for '$' prefix */ @@ -2551,7 +2551,7 @@ p_112(lex_state lex_state, act_state act_state, string *ZOt) { /* BEGINNING OF ACTION: no-token */ { -#line 507 "src/lx/parser.act" +#line 506 "src/lx/parser.act" (ZIt) = NULL; @@ -2644,7 +2644,7 @@ ZL1:; { /* BEGINNING OF ACTION: err-expected-semi */ { -#line 792 "src/lx/parser.act" +#line 790 "src/lx/parser.act" err_expected(lex_state, "';'"); @@ -2695,7 +2695,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* BEGINNING OF ACTION: no-zone */ { -#line 515 "src/lx/parser.act" +#line 514 "src/lx/parser.act" (ZIto) = NULL; @@ -2704,7 +2704,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-zone */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2749,7 +2749,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* BEGINNING OF ACTION: make-zone */ { -#line 432 "src/lx/parser.act" +#line 431 "src/lx/parser.act" assert((*ZIa) != NULL); @@ -2772,7 +2772,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: make-zone */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2829,7 +2829,7 @@ p_251(lex_state lex_state, act_state 
act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-to */ { -#line 796 "src/lx/parser.act" +#line 794 "src/lx/parser.act" err_expected(lex_state, "'..'"); @@ -2858,7 +2858,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* BEGINNING OF ACTION: no-zone */ { -#line 515 "src/lx/parser.act" +#line 514 "src/lx/parser.act" (ZIx) = NULL; @@ -2867,7 +2867,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-zone */ /* BEGINNING OF ACTION: no-token */ { -#line 507 "src/lx/parser.act" +#line 506 "src/lx/parser.act" (ZIy) = NULL; @@ -2876,7 +2876,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-token */ /* BEGINNING OF ACTION: regex-any */ { -#line 395 "src/lx/parser.act" +#line 392 "src/lx/parser.act" fsm_state_t start, end; @@ -2912,7 +2912,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: regex-any */ /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZIw) != NULL); @@ -2937,7 +2937,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: subtract-exit */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -2997,7 +2997,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-list */ { -#line 816 "src/lx/parser.act" +#line 814 "src/lx/parser.act" err_expected(lex_state, "list of mappings, bindings or zones"); @@ -3017,7 +3017,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* BEGINNING OF ACTION: no-exit */ { -#line 511 "src/lx/parser.act" +#line 510 "src/lx/parser.act" (ZIr2) = NULL; @@ -3026,7 +3026,7 @@ p_251(lex_state lex_state, 
act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: no-exit */ /* BEGINNING OF ACTION: make-zone */ { -#line 432 "src/lx/parser.act" +#line 431 "src/lx/parser.act" assert((*ZIa) != NULL); @@ -3049,7 +3049,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* END OF ACTION: make-zone */ /* BEGINNING OF ACTION: add-mapping */ { -#line 453 "src/lx/parser.act" +#line 449 "src/lx/parser.act" struct ast_token *t; struct ast_mapping *m; @@ -3101,7 +3101,7 @@ p_251(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-list */ { -#line 816 "src/lx/parser.act" +#line 814 "src/lx/parser.act" err_expected(lex_state, "list of mappings, bindings or zones"); @@ -3149,7 +3149,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit { /* BEGINNING OF ACTION: err-expected-bind */ { -#line 788 "src/lx/parser.act" +#line 786 "src/lx/parser.act" err_expected(lex_state, "'='"); @@ -3168,7 +3168,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* BEGINNING OF ACTION: add-binding */ { -#line 485 "src/lx/parser.act" +#line 482 "src/lx/parser.act" struct var *v; @@ -3211,7 +3211,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit /* BEGINNING OF ACTION: deref-var */ { -#line 280 "src/lx/parser.act" +#line 277 "src/lx/parser.act" struct ast_zone *z; @@ -3254,7 +3254,7 @@ p_252(lex_state lex_state, act_state act_state, ast *ZIa, zone *ZIz, fsm *ZIexit } /* BEGINNING OF ACTION: subtract-exit */ { -#line 736 "src/lx/parser.act" +#line 735 "src/lx/parser.act" assert((ZI278) != NULL); @@ -3297,7 +3297,7 @@ ZL1:; /* BEGINNING OF TRAILER */ -#line 880 "src/lx/parser.act" +#line 818 "src/lx/parser.act" struct ast *lx_parse(FILE *f, const struct fsm_alloc *alloc) { diff --git a/src/lx/parser.h b/src/lx/parser.h index fdaff9879..947b194b5 100644 --- a/src/lx/parser.h +++ b/src/lx/parser.h 
@@ -9,7 +9,7 @@ /* BEGINNING OF HEADER */ -#line 139 "src/lx/parser.act" +#line 127 "src/lx/parser.act" #include @@ -29,7 +29,7 @@ extern void p_lx(lex_state, act_state, ast *); /* BEGINNING OF TRAILER */ -#line 882 "src/lx/parser.act" +#line 880 "src/lx/parser.act" #line 36 "src/lx/parser.h" diff --git a/src/lx/print/c.c b/src/lx/print/c.c index dae4bd937..3340f0d74 100644 --- a/src/lx/print/c.c +++ b/src/lx/print/c.c @@ -18,19 +18,17 @@ #include -#include "libfsm/internal.h" /* XXX */ -#include "libfsm/print/ir.h" /* XXX */ - #include "lx/lx.h" #include "lx/ast.h" #include "lx/print.h" -/* XXX: abstraction */ -int -fsm_print_cfrag(FILE *f, const struct ir *ir, - const struct fsm_options *opt, - const struct fsm_hooks *hooks, - const char *cp); +struct lx_hook_env { + const struct ast *ast; + /* Name of variable for the current character of input in the + * current scope, which depends on the IO options. */ + const char *cur_char_var; +}; + static int skip(const struct fsm *fsm, fsm_state_t state) @@ -38,7 +36,7 @@ skip(const struct fsm *fsm, fsm_state_t state) struct ast_mapping *m; assert(fsm != NULL); - assert(state < fsm->statecount); + assert(state < fsm_countstates(fsm)); if (!fsm_isend(fsm, state)) { return 1; @@ -100,7 +98,8 @@ shortest_example(const struct fsm *fsm, const struct ast_token *token, (void) fsm_getstart(fsm, &goal); min = INT_MAX; - for (i = 0; i < fsm->statecount; i++) { + const size_t statecount = fsm_countstates(fsm); + for (i = 0; i < statecount; i++) { const struct ast_mapping *m; int n; @@ -137,6 +136,51 @@ shortest_example(const struct fsm *fsm, const struct ast_token *token, return 1; } +static const char * +buf_op_prefix(void) +{ + if (api_tokbuf & API_FIXEDBUF) { + return "fixed"; + } else if (api_tokbuf & API_DYNBUF) { + return "dyn"; + } else { + assert(!"buf is neither fixed nor dyn"); + return NULL; + } +} + +static void +unget_character(FILE *f, bool pop, const char *cur_char_var) +{ + fprintf(f, "%sungetc(lx, %s); ", 
prefix.api, cur_char_var); + if (pop && (~api_exclude & API_BUF)) { + fprintf(f, "%s%spop(lx->buf_opaque); ", + prefix.api, buf_op_prefix()); + } +} + +static bool +endid_represents_dead_end(fsm_end_id_t endid, const struct ast *ast) +{ + const struct ast_mapping *m = ast_getendmappingbyendid(endid); + if (m == NULL) { + return false; + } + + /* For each zone, check if this endid is associated with its z->ml zone. + * If so, that endid is the "dead end" for that zone. + * + * The total number of zones and end ids (each corresponding to mapping) + * should stay small enough that linear search is fine. If this becomes + * prohibitively expensive, then build a bitset of dead-end IDs upfront + * in one pass. */ + for (struct ast_zone *z = ast->zl; z != NULL; z = z->next) { + if (z->ml == m) { return true; } + } + + return false; +} + static int accept_c(FILE *f, const struct fsm_options *opt, const struct fsm_state_metadata *state_metadata, @@ -144,6 +188,7 @@ accept_c(FILE *f, const struct fsm_options *opt, { const struct ast *ast; const struct ast_mapping *m; + struct lx_hook_env *env = hook_opaque; assert(f != NULL); assert(opt != NULL); @@ -152,28 +197,62 @@ accept_c(FILE *f, const struct fsm_options *opt, assert(lang_opaque == NULL); assert(hook_opaque != NULL); - ast = hook_opaque; + ast = env->ast; m = ast_getendmappingbyendid(state_metadata->end_ids[0]); - /* XXX: don't need this if complete */ - fprintf(f, "%sungetc(lx, c); ", prefix.api); - fprintf(f, "return "); - if (m->to != NULL) { - fprintf(f, "lx->z = z%u, ", zindexof(ast, m->to)); + /* re-sync before new call into zone */ + switch (opt->io) { + case FSM_IO_GETC: + break; + + case FSM_IO_STR: + case FSM_IO_PAIR: + fprintf(f, "lx->p = p; "); + break; } - if (m->token != NULL) { - fprintf(f, "%s", prefix.tok); - esctok(f, m->token->s); + + fprintf(f, "return "); + if (m->to == NULL) { + if (m->token == NULL) { + /* If accept-ing here doesn't actually map to a token or + * a different zone, then check 
whether the endid represents + * a dead end. In that case, it's stuck in the middle of a + * pattern pair like `'//' .. /\n/ -> $nl;` with an unexpected + * EOF, so tokenization should still fail (with TOK_UNKNOWN). + * + * An example where the endid doesn't represent a dead end is + * a zone ignoring trailing whitespace in a file, such as + * `/[\r\n\t ]+/;`. In that case, the EOF is valid, so still + * return TOK_EOF. */ + const fsm_end_id_t endid = state_metadata->end_ids[0]; + if (endid_represents_dead_end(endid, ast)) { + fprintf(f, "%sUNKNOWN", prefix.tok); + } else { + fprintf(f, "%sEOF", prefix.tok); + } + } else { + /* yield a token */ + fprintf(f, "%s", prefix.tok); + esctok(f, m->token->s); + } } else { - fprintf(f, "lx->z(lx)"); + if (m->token == NULL) { + /* update to a different zone, then call to it */ + fprintf(f, "lx->z = z%u, lx->z(lx)", zindexof(ast, m->to)); + } else { + /* update zone, then yield a token */ + fprintf(f, "lx->z = z%u, ", zindexof(ast, m->to)); + fprintf(f, "%s", prefix.tok); + esctok(f, m->token->s); + } } fprintf(f, ";"); - return 0; } static int reject_c(FILE *f, const struct fsm_options *opt, + const struct fsm_state_metadata *state_metadata, void *lang_opaque, void *hook_opaque) { assert(f != NULL); @@ -182,7 +261,58 @@ reject_c(FILE *f, const struct fsm_options *opt, assert(hook_opaque != NULL); (void) lang_opaque; - (void) hook_opaque; + struct lx_hook_env *env = hook_opaque; + + const struct ast_mapping *m = state_metadata != NULL && state_metadata->end_id_count > 0 + ? ast_getendmappingbyendid(state_metadata->end_ids[0]) + : NULL; + + /* If there is an AST mapping associated with this end state, + * then unget the previous character (in most cases), and + * possibly emit its token type and/or new z state. 
*/ + if (m != NULL) { + const bool has_endids = state_metadata && state_metadata->end_id_count > 0; + if (m->token == NULL && m->to == NULL && !has_endids) { + unget_character(f, true, env->cur_char_var); + } else if (m->token == NULL && m->to == NULL && has_endids) { + unget_character(f, true, env->cur_char_var); + } else if (m->token == NULL && m->to != NULL) { + unget_character(f, true, env->cur_char_var); + } else if (m->token != NULL && m->to == NULL) { + assert(has_endids); + unget_character(f, true, env->cur_char_var); + } else if (m->token != NULL && m->to != NULL) { + unget_character(f, true, env->cur_char_var); + } + + /* re-sync before new call into zone */ + switch (opt->io) { + case FSM_IO_GETC: + break; + + case FSM_IO_STR: + case FSM_IO_PAIR: + fprintf(f, "lx->p = p; "); + break; + } + + fprintf(f, "return "); + if (m->to != NULL) { + fprintf(f, "lx->z = z%u, ", zindexof(env->ast, m->to)); + } + if (m->token != NULL) { + fprintf(f, "%s", prefix.tok); + esctok(f, m->token->s); + } else { + fprintf(f, "lx->z(lx)"); + } + fprintf(f, ";"); + return 0; + } else { + fprintf(f, "\n\t\t\t\tif (!has_consumed_input) { return %sEOF; }\n", prefix.tok); + fprintf(f, "\t\t\t\t"); + unget_character(f, false, env->cur_char_var); + } /* XXX: don't need this if complete */ switch (opt->io) { @@ -191,16 +321,36 @@ reject_c(FILE *f, const struct fsm_options *opt, break; case FSM_IO_STR: - fprintf(f, "lx->p = NULL; "); - break; - case FSM_IO_PAIR: fprintf(f, "lx->p = NULL; "); break; } fprintf(f, "return %sUNKNOWN;", prefix.tok); + return 0; +} +static int +advance_c(FILE *f, const struct fsm_options *opt, const char *cur_char_var, void *hook_opaque) +{ + (void)hook_opaque; + + fprintf(f, "\t\thas_consumed_input = 1;\n"); + + switch (opt->io) { + case FSM_IO_GETC: + break; + + case FSM_IO_STR: + case FSM_IO_PAIR: + /* When libfsm's generated code advances a character, update + * lx's token name buffer and position bookkeeping. 
*/ + if (~api_exclude & API_POS) { + fprintf(f, "\t\tif (!%sadvance_end(lx, %s)) { return %sERROR; }\n", + prefix.api, cur_char_var, prefix.tok); + } + break; + } return 0; } @@ -216,20 +366,32 @@ print_proto(FILE *f, const struct ast *ast, const struct ast_zone *z) } static void -print_lgetc(FILE *f) +print_lgetc(FILE *f, const struct fsm_options *opt) { if (api_getc & API_FGETC) { if (print_progress) { fprintf(stderr, " fgetc"); } + if (opt->comments) { + fprintf(f, "/* Get a character from fgetc and push it to the buffer */\n"); + } fprintf(f, "int\n"); fprintf(f, "%sfgetc(struct %slx *lx)\n", prefix.api, prefix.lx); fprintf(f, "{\n"); fprintf(f, "\tassert(lx != NULL);\n"); fprintf(f, "\tassert(lx->getc_opaque != NULL);\n"); fprintf(f, "\n"); - fprintf(f, "\treturn fgetc(lx->getc_opaque);\n"); + + fprintf(f, "\tconst int c = fgetc(lx->getc_opaque);\n"); + fprintf(f, "\tif (c == EOF) {\n"); + fprintf(f, "\t\tlx->c = EOF;\n"); + fprintf(f, "\t\treturn EOF;\n"); + fprintf(f, "\t} else {\n"); + + fprintf(f, "\t\treturn c;\n"); + fprintf(f, "\t}\n"); + fprintf(f, "}\n"); fprintf(f, "\n"); } @@ -346,22 +508,67 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(stderr, " io"); } - /* TODO: consider passing char *c, and return int 0/-1 for error */ - fprintf(f, "#if __STDC_VERSION__ >= 199901L\n"); - fprintf(f, "inline\n"); - fprintf(f, "#endif\n"); - fprintf(f, "static int\n"); - fprintf(f, "lx_getc(struct %slx *lx)\n", prefix.lx); - fprintf(f, "{\n"); - fprintf(f, "\tint c;\n"); - fprintf(f, "\n"); + if (opt->io == FSM_IO_GETC || (~api_exclude & API_POS)) { + fprintf(f, "static int\n"); + fprintf(f, "%sadvance_end(struct %slx *lx, int c)\n", prefix.api, prefix.lx); + fprintf(f, "{\n"); - fprintf(f, "\tassert(lx != NULL);\n"); + if (api_exclude & API_POS) { + fprintf(f, "\t(void)lx; (void)c;\n"); + } else { + fprintf(f, "\tlx->end.byte++;\n"); + fprintf(f, "\tlx->end.col++;\n"); + + fprintf(f, "\tif (c == '\\n') {\n"); + fprintf(f, "\t\tlx->end.line++;\n"); + 
fprintf(f, "\t\tlx->end.saved_col = lx->end.col - 1;\n"); + fprintf(f, "\t\tlx->end.col = 1;\n"); + + if (opt->io == FSM_IO_STR) { /* ignore terminating '\0' */ + fprintf(f, "\t} else if (c == '\\0') { /* don't count terminating '\\0' */\n"); + fprintf(f, "\t\tlx->end.byte--;\n"); + fprintf(f, "\t\tlx->end.col--;\n"); + fprintf(f, "\t}\n"); + } else { + fprintf(f, "\t}\n"); + } + } + + if (api_exclude & API_BUF) { + fprintf(f, "\t(void)lx; (void)c;\n"); + } else { + fprintf(f, "\tif (lx->push != NULL) {\n"); + fprintf(f, "\t\tif (-1 == lx->push(lx->buf_opaque, (char)c)) {\n"); + fprintf(f, "\t\t\treturn 0;\n"); + fprintf(f, "\t\t}\n"); + fprintf(f, "\t}\n"); + } + + fprintf(f, "\treturn 1;\n"); + fprintf(f, "}\n"); + fprintf(f, "\n"); + } + + if (opt->io == FSM_IO_GETC) { + /* TODO: consider passing char *c, and return int 0/-1 for error */ + if (opt->comments) { + fprintf(f, "/* This wrapper manages one character of lookahead/pushback\n"); + fprintf(f, " * and the line, column, and byte offsets. */\n"); + } + fprintf(f, "#if __STDC_VERSION__ >= 199901L\n"); + fprintf(f, "inline\n"); + fprintf(f, "#endif\n"); + fprintf(f, "static int\n"); + fprintf(f, "%sgetc(struct %slx *lx)\n", prefix.api, prefix.lx); + fprintf(f, "{\n"); + fprintf(f, "\tint c;\n"); + fprintf(f, "\n"); + + fprintf(f, "\tassert(lx != NULL);\n"); - switch (opt->io) { - case FSM_IO_GETC: fprintf(f, "\tassert(lx->lgetc != NULL);\n"); fprintf(f, "\n"); + fprintf(f, "\tif (lx->c != EOF) {\n"); fprintf(f, "\t\tc = lx->c, lx->c = EOF;\n"); fprintf(f, "\t} else {\n"); @@ -371,54 +578,28 @@ print_io(FILE *f, const struct fsm_options *opt) fprintf(f, "\t\t}\n"); fprintf(f, "\t}\n"); fprintf(f, "\n"); - break; - case FSM_IO_STR: - /* - * For FSM_IO_STR we treat '\0' as the end of input, - * and so there's no need to distinguish it from EOF. - * We return '\0' here to save the assignment. 
- */ - fprintf(f, "\tassert(lx->p != NULL);\n"); - fprintf(f, "\n"); - fprintf(f, "\tc = *lx->p++;\n"); + /* FIXME: This should distinguish between alloc failure + * and EOF, but will require layers of interface changes. */ + fprintf(f, "\tif (!%sadvance_end(lx, c)) { return EOF; }\n", prefix.api); fprintf(f, "\n"); - break; - case FSM_IO_PAIR: - fprintf(f, "\tassert(lx->p != NULL);\n"); - fprintf(f, "\n"); - fprintf(f, "\tif (lx->p == lx->e) {\n"); - fprintf(f, "\t\t\treturn EOF;\n"); - fprintf(f, "\t}\n"); - fprintf(f, "\n"); - fprintf(f, "\tc = *lx->p++;\n"); + fprintf(f, "\treturn c;\n"); + fprintf(f, "}\n"); fprintf(f, "\n"); - break; - } - if (~api_exclude & API_POS) { - fprintf(f, "\tlx->end.byte++;\n"); - fprintf(f, "\tlx->end.col++;\n"); - fprintf(f, "\n"); - fprintf(f, "\tif (c == '\\n') {\n"); - fprintf(f, "\t\tlx->end.line++;\n"); - fprintf(f, "\t\tlx->end.saved_col = lx->end.col - 1;\n"); - fprintf(f, "\t\tlx->end.col = 1;\n"); - - if (opt->io == FSM_IO_STR) { /* ignore terminating '\0' */ - fprintf(f, "\t} else if (c == '\\0') { /* don't count terminating '\\0' */\n"); - fprintf(f, "\t\tlx->end.byte--;\n"); - fprintf(f, "\t\tlx->end.col--;\n"); - fprintf(f, "\t}\n"); - } else { - fprintf(f, "\t}\n"); - } + /* Add an implementation of fsm_getc that calls back + * into lx_getc with the lx handle. */ + fprintf(f, "/* This wrapper adapts calling %sgetc to the interface\n", prefix.api); + fprintf(f, " * in libfsm's generated code. 
*/\n"); + fprintf(f, "static int\n"); + fprintf(f, "fsm_getc(void *getc_opaque)\n"); + fprintf(f, "{\n"); + + fprintf(f, "\treturn %sgetc((struct %slx *)getc_opaque);\n", prefix.api, prefix.lx); + fprintf(f, "}\n"); fprintf(f, "\n"); } - fprintf(f, "\treturn c;\n"); - fprintf(f, "}\n"); - fprintf(f, "\n"); fprintf(f, "#if __STDC_VERSION__ >= 199901L\n"); fprintf(f, "inline\n"); @@ -431,30 +612,21 @@ print_io(FILE *f, const struct fsm_options *opt) switch (opt->io) { case FSM_IO_GETC: fprintf(f, "\tassert(lx->c == EOF);\n"); - fprintf(f, "\n"); fprintf(f, "\tlx->c = c;\n"); - fprintf(f, "\n"); break; case FSM_IO_STR: fprintf(f, "\tassert(lx->p != NULL);\n"); - fprintf(f, "\tassert(*(lx->p - 1) == c);\n"); - fprintf(f, "\n"); - fprintf(f, "\tlx->p--;\n"); - fprintf(f, "\n"); break; case FSM_IO_PAIR: fprintf(f, "\tassert(lx->p != NULL);\n"); - fprintf(f, "\tassert(*(lx->p - 1) == c);\n"); - fprintf(f, "\n"); - fprintf(f, "\tlx->p--;\n"); - fprintf(f, "\n"); break; } - if (~api_exclude & API_POS) { - fprintf(f, "\n"); + if (api_exclude & API_POS) { + fprintf(f, "\t(void)lx; (void)c;\n"); + } else { fprintf(f, "\tlx->end.byte--;\n"); fprintf(f, "\tlx->end.col--;\n"); fprintf(f, "\n"); @@ -468,7 +640,7 @@ print_io(FILE *f, const struct fsm_options *opt) } static void -print_buf(FILE *f) +print_buf(FILE *f, const struct fsm_options *opt) { if (api_tokbuf & API_DYNBUF) { if (print_progress) { @@ -518,7 +690,28 @@ print_buf(FILE *f) fprintf(f, "}\n"); fprintf(f, "\n"); + + if ((~api_exclude & API_BUF) && (api_tokbuf & API_DYNBUF)) { + fprintf(f, "static void\n"); + fprintf(f, "%sdynpop(void *buf_opaque)\n", prefix.api); + fprintf(f, "{\n"); + fprintf(f, "\tstruct lx_dynbuf *t = buf_opaque;\n"); + fprintf(f, "\n"); + fprintf(f, "\tassert(t != NULL);\n"); + fprintf(f, "\n"); + + if (opt->io == FSM_IO_GETC) { + fprintf(f, "\tassert(t->p != t->a);\n"); + } + + fprintf(f, "\tt->p--;\n"); + + fprintf(f, "}\n"); + fprintf(f, "\n"); + } + fprintf(f, "int\n"); + /* FIXME: handle 
error from dynclear */ fprintf(f, "%sdynclear(void *buf_opaque)\n", prefix.api); fprintf(f, "{\n"); fprintf(f, "\tstruct lx_dynbuf *t = buf_opaque;\n"); @@ -542,6 +735,7 @@ print_buf(FILE *f) fprintf(f, "\n"); fprintf(f, "\tt->p = t->a;\n"); fprintf(f, "\n"); + fprintf(f, "\treturn 0;\n"); fprintf(f, "}\n"); fprintf(f, "\n"); @@ -582,6 +776,24 @@ print_buf(FILE *f) fprintf(f, "}\n"); fprintf(f, "\n"); + if (~api_exclude & API_BUF && (api_tokbuf & API_FIXEDBUF)) { + fprintf(f, "static void\n"); + fprintf(f, "%sfixedpop(void *buf_opaque)\n", prefix.api); + fprintf(f, "{\n"); + fprintf(f, "\tstruct lx_fixedbuf *t = buf_opaque;\n"); + fprintf(f, "\n"); + fprintf(f, "\tassert(t != NULL);\n"); + fprintf(f, "\tassert(t->p != NULL);\n"); + fprintf(f, "\tassert(t->a != NULL);\n"); + + if (opt->io == FSM_IO_GETC) { + fprintf(f, "\tassert(t->p > t->a);\n"); + } + fprintf(f, "\tt->p--;\n"); + fprintf(f, "}\n"); + fprintf(f, "\n"); + } + fprintf(f, "int\n"); fprintf(f, "%sfixedclear(void *buf_opaque)\n", prefix.api); fprintf(f, "{\n"); @@ -599,49 +811,44 @@ print_buf(FILE *f) } } -static void -print_stateenum(FILE *f, const struct fsm *fsm) -{ - fsm_state_t i; - - fprintf(f, "\tenum {\n"); - fprintf(f, "\t\t"); - - for (i = 0; i < fsm->statecount; i++) { - fprintf(f, "S%u, ", i); - - if (i + 1 < fsm->statecount && (i + 1) % 10 == 0) { - fprintf(f, "\n"); - fprintf(f, "\t\t"); - } - } - - fprintf(f, "NONE"); - - fprintf(f, "\n"); - fprintf(f, "\t} state;\n"); -} - static int print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, - const struct fsm_options *opt, const char *cp) + const struct fsm_options *opt, const char *cur_char_var) { assert(f != NULL); assert(z != NULL); assert(z->fsm != NULL); assert(fsm_all(z->fsm, fsm_isdfa)); assert(ast != NULL); - assert(cp != NULL); + assert(cur_char_var != NULL); - /* TODO: prerequisite that the FSM is a DFA */ + /* prerequisite that the FSM is a DFA */ + assert(fsm_all(z->fsm, fsm_isdfa)); fprintf(f, "static enum 
%stoken\n", prefix.api); fprintf(f, "z%u(struct %slx *lx)\n", zindexof(ast, z), prefix.lx); fprintf(f, "{\n"); - fprintf(f, "\tint c;\n"); - fprintf(f, "\n"); - print_stateenum(f, z->fsm); + /* This flag indicates whether the any of the input stream was + * consumed before getting EOF and skipping over the state and + * character logic expanded here. + * + * lx needs to track this for proper EOF handling. It previously + * generated the state enum itself, so that it could include an + * additional 'NONE' state. Inside the input loop, the default + * state of NONE would be updated to the start state, but if the + * input loop was skipped it would still be NONE. */ + fprintf(f, "\tint has_consumed_input = 0;\n"); + + switch (opt->io) { + case FSM_IO_GETC: + fprintf(f, "\tint c;\n"); + break; + case FSM_IO_STR: + case FSM_IO_PAIR: + break; + } + fprintf(f, "\n"); fprintf(f, "\tassert(lx != NULL);\n"); @@ -654,9 +861,6 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\n"); } - fprintf(f, "\tstate = NONE;\n"); - fprintf(f, "\n"); - if (~api_exclude & API_POS) { fprintf(f, "\tlx->start = lx->end;\n"); fprintf(f, "\n"); @@ -664,52 +868,38 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, switch (opt->io) { case FSM_IO_GETC: - fprintf(f, "\twhile (c = lx_getc(lx), c != EOF) {\n"); + fprintf(f, "\tvoid *getc_opaque = (void *)lx;\n"); + + /* This must be called with fragment sent, otherwise + * it will generate a nested function definition. 
*/ + assert(opt->fragment); break; case FSM_IO_STR: - fprintf(f, "\twhile (c = lx_getc(lx), c != '\\0') {\n"); + fprintf(f, "const char *s = lx->p;\n"); + fprintf(f, "const char *p;\n"); break; case FSM_IO_PAIR: - fprintf(f, "\twhile (c = lx_getc(lx), c != EOF) {\n"); + fprintf(f, "\tconst char *p, *b = lx->p, *e = lx->e;\n"); break; } - { - fsm_state_t start; - - if (!fsm_getstart(z->fsm, &start)) { - errno = EINVAL; - return -1; - } - - fprintf(f, "\t\tif (state == NONE) {\n"); - fprintf(f, "\t\t\tstate = S%u;\n", start); - fprintf(f, "\t\t}\n"); - fprintf(f, "\n"); - } - { static const struct fsm_hooks defaults; struct fsm_hooks hooks = defaults; - struct ir *ir; - assert(cp != NULL); + struct lx_hook_env hook_env = { + .ast = ast, + .cur_char_var = cur_char_var, + }; hooks.accept = accept_c; hooks.reject = reject_c; - hooks.hook_opaque = (void *) ast; - - ir = make_ir(z->fsm, opt); - if (ir == NULL) { - /* TODO */ - } + hooks.advance = advance_c; + hooks.hook_opaque = &hook_env; - /* XXX: abstraction */ - (void) fsm_print_cfrag(f, ir, opt, &hooks, cp); - - free_ir(z->fsm, ir); + fsm_print(f, z->fsm, opt, &hooks, FSM_PRINT_C); } if (~api_exclude & API_BUF) { @@ -718,7 +908,8 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, has_skips = 0; - for (i = 0; i < z->fsm->statecount; i++) { + const size_t statecount = fsm_countstates(z->fsm); + for (i = 0; i < statecount; i++) { int r; r = fsm_reachableall(z->fsm, i, skip); @@ -740,7 +931,7 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\n"); fprintf(f, "\t\tswitch (state) {\n"); - for (i = 0; i < z->fsm->statecount; i++) { + for (i = 0; i < statecount; i++) { int r; r = fsm_reachableall(z->fsm, i, skip); @@ -760,7 +951,7 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, fprintf(f, "\t\tdefault:\n"); fprintf(f, "\t\t\tif (lx->push != NULL) {\n"); - fprintf(f, "\t\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cp); + fprintf(f, 
"\t\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cur_char_var); fprintf(f, "\t\t\t\t\treturn %sERROR;\n", prefix.tok); fprintf(f, "\t\t\t\t}\n"); fprintf(f, "\t\t\t}\n"); @@ -771,20 +962,16 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, } else { fprintf(f, "\n"); fprintf(f, "\t\tif (lx->push != NULL) {\n"); - fprintf(f, "\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cp); + fprintf(f, "\t\t\tif (-1 == lx->push(lx->buf_opaque, (char)%s)) {\n", cur_char_var); fprintf(f, "\t\t\t\treturn %sERROR;\n", prefix.tok); fprintf(f, "\t\t\t}\n"); fprintf(f, "\t\t}\n"); } } - fprintf(f, "\t}\n"); - fprintf(f, "\n"); { - fsm_state_t i; - switch (opt->io) { case FSM_IO_GETC: fprintf(f, "\tlx->lgetc = NULL;\n"); @@ -802,41 +989,11 @@ print_zone(FILE *f, const struct ast *ast, const struct ast_zone *z, break; } - fprintf(f, "\tswitch (state) {\n"); - - fprintf(f, "\tcase NONE: return %sEOF;\n", prefix.tok); - - for (i = 0; i < z->fsm->statecount; i++) { - const struct ast_mapping *m; - - if (!fsm_isend(z->fsm, i)) { - continue; - } - - m = ast_getendmapping(z->fsm, i); - if (LOG()) { - fprintf(stderr, "print_zone: ast_getendmapping for state %d: %p (c)\n", - i, (void *)m); - } - assert(m != NULL); - - fprintf(f, "\tcase S%u: return ", (unsigned) i); - - /* note: no point in changing zone here, because getc is now NULL */ - - if (m->token == NULL) { - fprintf(f, "%sEOF;\n", prefix.tok); - } else { - /* TODO: maybe make a printf-like little language to simplify this */ - fprintf(f, "%s", prefix.tok); - esctok(f, m->token->s); - fprintf(f, ";\n"); - } - } - - fprintf(f, "\tdefault: errno = EINVAL; return %sERROR;\n", prefix.tok); - - fprintf(f, "\t}\n"); + fprintf(f, "\tif (!has_consumed_input) {\n"); + fprintf(f, "\t\treturn %sEOF;\n", prefix.tok); + fprintf(f, "\t} \n"); + fprintf(f, "\treturn %sERROR;", prefix.tok); + fprintf(f, "\n"); } fprintf(f, "}\n\n"); @@ -967,8 +1124,8 @@ lx_print_c(FILE *f, const struct ast *ast, const struct 
fsm_options *opt) switch (opt->io) { case FSM_IO_GETC: cp = "c"; break; - case FSM_IO_STR: cp = "c"; break; - case FSM_IO_PAIR: cp = "c"; break; + case FSM_IO_STR: cp = "*p"; break; + case FSM_IO_PAIR: cp = "*p"; break; } for (z = ast->zl; z != NULL; z = z->next) { @@ -1006,9 +1163,9 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\n"); print_io(f, opt); - print_lgetc(f); + print_lgetc(f, opt); - print_buf(f); + print_buf(f, opt); if (print_progress) { zn = 0; @@ -1041,6 +1198,7 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "void\n"); fprintf(f, "%sinit(struct %slx *lx)\n", prefix.api, prefix.lx); fprintf(f, "{\n"); + fprintf(f, "\tstatic const struct %slx lx_default;\n", prefix.lx); fprintf(f, "\n"); fprintf(f, "\tassert(lx != NULL);\n"); @@ -1048,16 +1206,8 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t*lx = lx_default;\n"); fprintf(f, "\n"); - switch (opt->io) { - case FSM_IO_GETC: + if (opt->io == FSM_IO_GETC) { fprintf(f, "\tlx->c = EOF;\n"); - break; - - case FSM_IO_STR: - break; - - case FSM_IO_PAIR: - break; } fprintf(f, "\tlx->z = z%u;\n", zindexof(ast, ast->global)); @@ -1067,6 +1217,16 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tlx->end.line = 1;\n"); fprintf(f, "\tlx->end.col = 1;\n"); } + + /* Suppress warning for possibly unused function */ + if (~api_exclude & API_BUF) { + if (api_tokbuf & API_FIXEDBUF) { + fprintf(f, "\t(void)%sfixedpop;\n", prefix.api); + } else if (api_tokbuf & API_DYNBUF) { + fprintf(f, "\t(void)%sdynpop;\n", prefix.api); + } + } + fprintf(f, "}\n"); fprintf(f, "\n"); } @@ -1088,9 +1248,6 @@ lx_print_c(FILE *f, const struct ast *ast, const struct fsm_options *opt) break; case FSM_IO_STR: - fprintf(f, "\tif (lx->p == NULL) {\n"); - break; - case FSM_IO_PAIR: fprintf(f, "\tif (lx->p == NULL) {\n"); break; diff --git a/src/lx/print/dump.c b/src/lx/print/dump.c 
index 162630f88..e693f9082 100644 --- a/src/lx/print/dump.c +++ b/src/lx/print/dump.c @@ -68,12 +68,12 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "main(int argc, char *argv[])\n"); fprintf(f, "{\n"); - fprintf(f, "\tenum lx_token t;\n"); - fprintf(f, "\tstruct lx lx = { 0 };\n"); + fprintf(f, "\tenum %stoken t;\n", prefix.api); + fprintf(f, "\tstruct %slx lx = { 0 };\n", prefix.lx); switch (opt->io) { case FSM_IO_GETC: - fprintf(f, "\tint (*lgetc)(struct lx *lx);\n"); + fprintf(f, "\tint (*lgetc)(struct %slx *lx);\n", prefix.lx); fprintf(f, "\tvoid *getc_opaque;\n"); break; @@ -135,7 +135,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) switch (api_getc) { case API_FGETC: - fprintf(f, "\tlgetc = lx_fgetc;\n"); + fprintf(f, "\tlgetc = %sfgetc;\n", prefix.api); fprintf(f, "\tgetc_opaque = stdin;\n"); fprintf(f, "\n"); break; @@ -144,7 +144,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\ts = argv[1];\n"); fprintf(f, "\n"); - fprintf(f, "\tlgetc = lx_sgetc;\n"); + fprintf(f, "\tlgetc = %ssgetc;\n", prefix.api); fprintf(f, "\tgetc_opaque = &s;\n"); fprintf(f, "\n"); break; @@ -154,7 +154,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tarr.len = strlen(arr.p);\n"); fprintf(f, "\n"); - fprintf(f, "\tlgetc = lx_agetc;\n"); + fprintf(f, "\tlgetc = %sagetc;\n", prefix.api); fprintf(f, "\tgetc_opaque = &arr;\n"); fprintf(f, "\n"); break; @@ -167,13 +167,13 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\td.fd = fileno(stdin);\n"); fprintf(f, "\n"); - fprintf(f, "\tlgetc = lx_dgetc;\n"); + fprintf(f, "\tlgetc = %sdgetc;\n", prefix.api); fprintf(f, "\tgetc_opaque = &d;\n"); fprintf(f, "\n"); break; } - fprintf(f, "\tlx_init(&lx);\n"); + fprintf(f, "\t%sinit(&lx);\n", prefix.api); fprintf(f, "\n"); switch (opt->io) { @@ -201,10 +201,12 @@ lx_print_dump(FILE *f, 
const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tbuf.len = 0;\n"); fprintf(f, "\n"); - fprintf(f, "\tlx.buf_opaque = &buf;\n"); - fprintf(f, "\tlx.push = lx_dynpush;\n"); - fprintf(f, "\tlx.clear = lx_dynclear;\n"); - fprintf(f, "\tlx.free = lx_dynfree;\n"); + if (~api_exclude & API_BUF) { + fprintf(f, "\tlx.buf_opaque = &buf;\n"); + fprintf(f, "\tlx.push = %sdynpush;\n", prefix.api); + fprintf(f, "\tlx.clear = %sdynclear;\n", prefix.api); + fprintf(f, "\tlx.free = %sdynfree;\n", prefix.api); + } fprintf(f, "\n"); break; @@ -214,10 +216,12 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\tbuf.len = sizeof a;\n"); /* XXX: rename .len to .size */ fprintf(f, "\n"); - fprintf(f, "\tlx.buf_opaque = &buf;\n"); - fprintf(f, "\tlx.push = lx_fixedpush;\n"); - fprintf(f, "\tlx.clear = lx_fixedclear;\n"); - fprintf(f, "\tlx.free = NULL;\n"); + if (~api_exclude & API_BUF) { + fprintf(f, "\tlx.buf_opaque = &buf;\n"); + fprintf(f, "\tlx.push = %sfixedpush;\n", prefix.api); + fprintf(f, "\tlx.clear = %sfixedclear;\n", prefix.api); + fprintf(f, "\tlx.free = NULL;\n"); + } fprintf(f, "\n"); break; } @@ -227,7 +231,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t\tconst char *q;\n"); fprintf(f, "\n"); - fprintf(f, "\t\tt = lx_next(&lx);\n"); + fprintf(f, "\t\tt = %snext(&lx);\n", prefix.api); fprintf(f, "\n"); switch (api_tokbuf) { @@ -274,7 +278,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\n"); fprintf(f, "\t\tcase TOK_ERROR:\n"); - fprintf(f, "\t\t\tperror(\"lx_next\");\n"); + fprintf(f, "\t\t\tperror(\"%snext\");\n", prefix.api); fprintf(f, "\t\t\tbreak;\n"); fprintf(f, "\n"); @@ -287,7 +291,7 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t\tdefault:\n"); if (~api_exclude & API_NAME) { - fprintf(f, "\t\t\tprintf(\"<%%s\", lx_name(t));\n"); + fprintf(f, "\t\t\tprintf(\"<%%s\", 
%sname(t));\n", prefix.api); fprintf(f, "\t\t\tdump_buf(q, l);\n"); fprintf(f, "\t\t\tprintf(\">\\n\");\n"); } else { @@ -303,6 +307,11 @@ lx_print_dump(FILE *f, const struct ast *ast, const struct fsm_options *opt) fprintf(f, "\t} while (t != TOK_ERROR && t != TOK_EOF && t != TOK_UNKNOWN);\n"); fprintf(f, "\n"); + if (api_tokbuf == API_DYNBUF && (~api_exclude & API_BUF)) { + fprintf(f, "\tlx.free(lx.buf_opaque);\n"); + fprintf(f, "\n"); + } + fprintf(f, "\treturn t == TOK_ERROR;\n"); fprintf(f, "}\n"); } diff --git a/src/retest/main.c b/src/retest/main.c index 86a7474be..f752c6caa 100644 --- a/src/retest/main.c +++ b/src/retest/main.c @@ -56,7 +56,6 @@ struct match { struct match *next; }; -static int tty_output = 0; static int do_timing = 0; static int do_watchdog = 0; @@ -970,14 +969,6 @@ process_test_file(const char *filename, flagstring(flags, &flagdesc[0]); - if (tty_output) { - char *re = dup_str_esc(regexp, NULL); - printf("[ ] line %d: working on %s regexp /%s/%s ...\r", - linenum, dialect_name, re, flagdesc); - fflush(stdout); - free(re); - } - re_str = regexp; fsm = re_comp(dialect, fsm_sgetc, &re_str, alloc, flags, &err); if (fsm == NULL) { @@ -1231,17 +1222,6 @@ main(int argc, char *argv[]) int optlevel = 1; - /* is output to a tty or not? */ - int fileno_stdout = fileno(stdout); - if (fileno_stdout == -1) { - perror("fileno"); - } else { - tty_output = isatty(fileno_stdout); - if (tty_output == -1) { - perror("isatty"); - } - } - /* note these defaults are the opposite than for fsm(1) */ opt.anonymous_states = 1; opt.consolidate_edges = 1; diff --git a/src/rx/main.c b/src/rx/main.c index d3099a4d4..627280933 100644 --- a/src/rx/main.c +++ b/src/rx/main.c @@ -464,9 +464,6 @@ build_literals_fsm(bool show_stats, } } - /* We don't minimise here because this fsm has multiple endids, - * and the resulting FSM would be very similar to the current DFA */ - #ifndef NDEBUG /* * We could test to see that the fsm isn't any different. 
@@ -481,6 +478,9 @@ build_literals_fsm(bool show_stats, */ #endif + /* We don't minimise here because this fsm has multiple endids, + * and the resulting FSM would be very similar to the current DFA */ + return fsm; } diff --git a/tests/eager_output/eager_output7.c b/tests/eager_output/eager_output7.c index 94e9f1787..fe16e121e 100644 --- a/tests/eager_output/eager_output7.c +++ b/tests/eager_output/eager_output7.c @@ -2,7 +2,26 @@ int main(void) { + /* Run this test with env FORCE_ENDIDS=N ... to see how much more + * expensive it is to combine the first N patterns using endids, + * rather than eager_outputs. It becomes VERY slow for >= 9 or so. + * (Note that the checks probably will not pass for N < 4, because + * it will start skipping strings appearing in the early test inputs.) */ + bool force_endids = false; + size_t force_endid_count = 0; + { + const char *str = getenv("FORCE_ENDIDS"); + if (str != NULL) { + force_endid_count = atoi(str); + if (force_endid_count == 0) { + force_endid_count = 26; + } + force_endids = true; + } + } + struct eager_output_test test = { + .force_endids = force_endids, .patterns = { [0] = "apple", [1] = "banana", @@ -71,5 +90,15 @@ int main(void) }, }; + /* truncate patterns to the first N */ + if (force_endids) { + assert(force_endid_count > 0 && force_endid_count <= 26); + test.patterns[force_endid_count] = NULL; + + /* truncate test inputs to just the first couple, since + * later inputs use later patterns */ + test.inputs[5].input = NULL; + } + return run_test(&test); } diff --git a/tests/eager_output/eager_output_at_start.c b/tests/eager_output/eager_output_at_start.c new file mode 100644 index 000000000..8ba5f2ad1 --- /dev/null +++ b/tests/eager_output/eager_output_at_start.c @@ -0,0 +1,12 @@ +#include "utils.h" + +int main(void) +{ + struct eager_output_test test = { + .patterns = { "" }, + .inputs = { + { .input = "", .expected_ids = { 1 } }, + }, + }; + return run_test(&test); +} diff --git 
a/tests/eager_output/eager_output_mixed_anchored_unanchored.c b/tests/eager_output/eager_output_mixed_anchored_unanchored.c index 7afb272db..376b49d8a 100644 --- a/tests/eager_output/eager_output_mixed_anchored_unanchored.c +++ b/tests/eager_output/eager_output_mixed_anchored_unanchored.c @@ -2,9 +2,6 @@ int main(void) { - /* fprintf(stderr, "%s: skipping for now, this doesn't pass yet.\n", __FILE__); */ - /* return EXIT_SUCCESS; */ - struct eager_output_test test = { .patterns = { "^abc$", diff --git a/tests/eager_output/eager_output_mixed_start_anchor.c b/tests/eager_output/eager_output_mixed_start_anchor.c new file mode 100644 index 000000000..2965dc7aa --- /dev/null +++ b/tests/eager_output/eager_output_mixed_start_anchor.c @@ -0,0 +1,20 @@ +#include "utils.h" + +/* Regression: This is a case that requires an anchored_start state set + * rather than a single optional anchored_start state ID to link + * correctly. */ + +int main(void) +{ + struct eager_output_test test = { + .patterns = { + "(^|wax-)((?:banana|^apple))", + "(^|wax-)(orange)", + }, + .inputs = { + { .input = "banana", .expected_ids = { 1 } }, + }, + }; + + return run_test(&test); +} diff --git a/tests/eager_output/utils.c b/tests/eager_output/utils.c index dfd2b952b..d1ff4f7b4 100644 --- a/tests/eager_output/utils.c +++ b/tests/eager_output/utils.c @@ -65,7 +65,7 @@ run_test(const struct eager_output_test *test) const char *p = test->patterns[i]; if (test->patterns[i] == NULL) { break; } - struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, 0, NULL); + struct fsm *fsm = re_comp(RE_PCRE, fsm_sgetc, &p, NULL, RE_SAVE_LINKAGE_INFO, NULL); assert(fsm != NULL); if (log) { @@ -83,7 +83,16 @@ run_test(const struct eager_output_test *test) } const size_t id_base = 1; /* offset by 1 because 0 is used as end-of-list */ - struct fsm *fsm = fsm_union_repeated_pattern_group(nfas_used, nfas, NULL, id_base); + struct fsm *fsm; + if (test->force_endids) { + for (size_t i = 0; i < nfas_used; i++) { + 
fsm_setendid(nfas[i], i + id_base); + } + fsm = fsm_union_array(nfas_used, nfas, NULL); + } else { + /* This function sets the eager output IDs. */ + fsm = fsm_union_repeated_pattern_group(nfas_used, nfas, NULL, id_base); + } assert(fsm != NULL); if (log) { @@ -167,6 +176,7 @@ run_test(const struct eager_output_test *test) fsm_state_t end; /* only set on match */ ret = fsm_exec(fsm, fsm_sgetc, &input, &end, NULL); + size_t match_id_count = 0; if (ret == 1) { #define ENDID_BUF_SIZE 32 fsm_end_id_t endid_buf[ENDID_BUF_SIZE] = {0}; @@ -177,10 +187,26 @@ run_test(const struct eager_output_test *test) assert(!"fsm_endid_get failed"); } + match_id_count += outputs.used; + for (size_t e_i = 0; e_i < endid_count; e_i++) { + fsm_end_id_t endid = endid_buf[e_i]; + bool found = false; + for (size_t o_i = 0; o_i < outputs.used; o_i++) { + if (outputs.ids[o_i] == endid) { + found = true; + break; + } + } + if (!found) { + /* Don't count IDs set by both endids AND eager outputs twice. */ + match_id_count++; + } + } + /* Copy endid outputs into outputs.ids[], since for testing * purposes we don't care about the difference between eager * output and endids here. 
*/ - assert(outputs.used + endid_count <= MAX_IDS); + assert(match_id_count <= MAX_IDS); for (size_t endid_i = 0; endid_i < endid_count; endid_i++) { if (log) { fprintf(stderr, "-- adding endid %zd: %d\n", endid_i, endid_buf[endid_i]); @@ -222,7 +248,7 @@ run_test(const struct eager_output_test *test) assert(ret == 1); } - assert(outputs.used >= expected_id_count); + assert(match_id_count == expected_id_count); size_t floor = 0; for (size_t exp_i = 0; exp_i < outputs.used; exp_i++) { diff --git a/tests/eager_output/utils.h b/tests/eager_output/utils.h index 02f8427c9..ee5f941c5 100644 --- a/tests/eager_output/utils.h +++ b/tests/eager_output/utils.h @@ -32,6 +32,7 @@ struct eager_output_test { const char *patterns[MAX_PATTERNS]; + bool force_endids; struct { const char *input; diff --git a/tests/lxpos/Makefile b/tests/lxpos/Makefile index da2263d24..abf11f58a 100644 --- a/tests/lxpos/Makefile +++ b/tests/lxpos/Makefile @@ -4,7 +4,7 @@ TEST.tests/lxpos != ls -1 tests/lxpos/out*.dump TEST_SRCDIR.tests/lxpos = tests/lxpos TEST_OUTDIR.tests/lxpos = ${BUILD}/tests/lxpos -LX?=${BUILD}/bin/lx +LX_BIN?=${BUILD}/bin/lx # for lx -l test LEXER += tests/lxpos/lexer.lx @@ -30,14 +30,14 @@ CFLAGS.${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump += -I ${BUILD}/tests/lxpos .for ext in c h -${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.${ext}: tests/lxpos/lexer.lx - ${LX} -l ${ext} ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ +${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.${ext}: tests/lxpos/lexer.lx ${LX_BIN} + ${LX_BIN} -l ${ext} ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ || { rm -f $@; false; } .endfor -${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump.c: tests/lxpos/lexer.lx - ${LX} -l dump ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ +${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump.c: tests/lxpos/lexer.lx ${LX_BIN} + ${LX_BIN} -l 
dump ${LX_CFLAGS} ${LX_CFLAGS.tests/lxpos/${buf}-${getc}-${io}-lexer.lx} < ${.ALLSRC:M*.lx} > $@ \ || { rm -f $@; false; } ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump: ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-lexer.h @@ -53,11 +53,11 @@ CLEAN += ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump ${TEST_OUTDIR.tests/lxpos}/${buf}-${getc}-${io}-got${n}.dump: ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump ${TEST_SRCDIR.tests/lxpos}/in${n}.txt .if ${getc} != none - cat ${.ALLSRC:M*.txt} getcio=${io} \ + cat ${.ALLSRC:M*.txt} \ | ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump \ > $@ .else - cat ${.ALLSRC:M*.txt} io=${io} \ + cat ${.ALLSRC:M*.txt} \ | xargs -0 ${BUILD}/tests/lxpos/${buf}-${getc}-${io}-dump \ > $@ .endif diff --git a/tests/lxpos/in10.txt b/tests/lxpos/in10.txt new file mode 100644 index 000000000..52cab2a4f --- /dev/null +++ b/tests/lxpos/in10.txt @@ -0,0 +1 @@ +d: r * t /*distance = rate * time* diff --git a/tests/lxpos/in8.txt b/tests/lxpos/in8.txt new file mode 100644 index 000000000..e5c01f9dd --- /dev/null +++ b/tests/lxpos/in8.txt @@ -0,0 +1 @@ +hello ` diff --git a/tests/lxpos/in9.txt b/tests/lxpos/in9.txt new file mode 100644 index 000000000..81d7792df --- /dev/null +++ b/tests/lxpos/in9.txt @@ -0,0 +1 @@ +d: r * t /*distance = rate * time diff --git a/tests/lxpos/out0.dump b/tests/lxpos/out0.dump index 347bb87dc..8f1a4ac1d 100644 --- a/tests/lxpos/out0.dump +++ b/tests/lxpos/out0.dump @@ -2,4 +2,4 @@ 6-12:1,7-13: 12-13:1-2,13-1: -12-13:1-2,13-1: +13:2,1: diff --git a/tests/lxpos/out1.dump b/tests/lxpos/out1.dump index bf4d9bcb9..f3432e1e3 100644 --- a/tests/lxpos/out1.dump +++ b/tests/lxpos/out1.dump @@ -6,4 +6,4 @@ 19-25:2,7-13: 25-26:2-3,13-1: -25-26:2-3,13-1: +26:3,1: diff --git a/tests/lxpos/out10.dump b/tests/lxpos/out10.dump new file mode 100644 index 000000000..4314c8beb --- /dev/null +++ b/tests/lxpos/out10.dump @@ -0,0 +1,7 @@ +0-1:1,1-2: +1-2:1,2-3: +3-4:1,4-5: +5-6:1,6-7: +7-8:1,8-9: +34-35:1-2,35-1: lexically uncategorised: ' +' diff 
--git a/tests/lxpos/out2.dump b/tests/lxpos/out2.dump index cf5aa7013..bdd9e90a6 100644 --- a/tests/lxpos/out2.dump +++ b/tests/lxpos/out2.dump @@ -9,4 +9,4 @@ 12-13:1,13-14: 13-14:1-2,14-1: -13-14:1-2,14-1: +14:2,1: diff --git a/tests/lxpos/out3.dump b/tests/lxpos/out3.dump index 92d9e915a..e7f0f1e70 100644 --- a/tests/lxpos/out3.dump +++ b/tests/lxpos/out3.dump @@ -5,4 +5,4 @@ 7-8:1,8-9: 35-36:1-2,36-1: -35-36:1-2,36-1: +36:2,1: diff --git a/tests/lxpos/out4.dump b/tests/lxpos/out4.dump index 6161924b0..c3bb9bb2d 100644 --- a/tests/lxpos/out4.dump +++ b/tests/lxpos/out4.dump @@ -8,4 +8,4 @@ 27-28:1-2,28-1: 28-29:2,1-2: -28-29:2,1-2: +29:2,2: diff --git a/tests/lxpos/out5.dump b/tests/lxpos/out5.dump index e2506a1df..040b6734a 100644 --- a/tests/lxpos/out5.dump +++ b/tests/lxpos/out5.dump @@ -7,4 +7,4 @@ 4-5:3,1-2: 5-6:3-4,2-1: -5-6:3-4,2-1: +6:4,1: diff --git a/tests/lxpos/out6.dump b/tests/lxpos/out6.dump index 8036b2ad4..c11d09988 100644 --- a/tests/lxpos/out6.dump +++ b/tests/lxpos/out6.dump @@ -1,2 +1,2 @@ 0-1:1,1-2: -4-5:1,5-6: +4-5:1,5-6: lexically uncategorised: ' ' diff --git a/tests/lxpos/out7.dump b/tests/lxpos/out7.dump index 7eeab8bcb..0706b7981 100644 --- a/tests/lxpos/out7.dump +++ b/tests/lxpos/out7.dump @@ -2,4 +2,4 @@ 17-18:2,12-13: 18-19:2-3,13-1: -18-19:2-3,13-1: +19:3,1: diff --git a/tests/lxpos/out8.dump b/tests/lxpos/out8.dump new file mode 100644 index 000000000..3300df2f6 --- /dev/null +++ b/tests/lxpos/out8.dump @@ -0,0 +1,3 @@ +0-5:1,1-6: +6-7:1,7-8: lexically uncategorised: '` +' diff --git a/tests/lxpos/out9.dump b/tests/lxpos/out9.dump new file mode 100644 index 000000000..1f3cf25bc --- /dev/null +++ b/tests/lxpos/out9.dump @@ -0,0 +1,7 @@ +0-1:1,1-2: +1-2:1,2-3: +3-4:1,4-5: +5-6:1,6-7: +7-8:1,8-9: +33-34:1-2,34-1: lexically uncategorised: ' +' diff --git a/tests/re_interpolate_groups/Makefile b/tests/re_interpolate_groups/Makefile new file mode 100644 index 000000000..41f9e2599 --- /dev/null +++ b/tests/re_interpolate_groups/Makefile 
@@ -0,0 +1,23 @@ +.include "../../share/mk/top.mk" + +TEST.tests/re_interpolate_groups != ls -1 tests/re_interpolate_groups/re_interpolate_groups*.c +TEST_SRCDIR.tests/re_interpolate_groups = tests/re_interpolate_groups +TEST_OUTDIR.tests/re_interpolate_groups = ${BUILD}/tests/re_interpolate_groups + +.for n in ${TEST.tests/re_interpolate_groups:T:R:C/^re_interpolate_groups//} +test:: ${TEST_OUTDIR.tests/re_interpolate_groups}/res${n} +SRC += ${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c +#CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -UNDEBUG +CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -std=c99 + +${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o + ${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o ${BUILD}/src/libre/re_interpolate_groups.o + +${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} + ( ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/re_interpolate_groups}/res${n} + +#.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} +#${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${BUILD}/lib/${lib:R}.a +#.endfor +.endfor + diff --git a/tests/re_interpolate_groups/re_interpolate_groups0.c b/tests/re_interpolate_groups/re_interpolate_groups0.c new file mode 100644 index 000000000..4b4e0f709 --- /dev/null +++ b/tests/re_interpolate_groups/re_interpolate_groups0.c @@ -0,0 +1,72 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. 
+ */ + +#include +#include +#include +#include + +#include +#include + +static unsigned failed; + +static void +test(const char *fmt, size_t groupc, const char *groupv[], const char *expected) +{ + char outs[40]; + bool r; + + assert(fmt != NULL); + assert(expected != NULL); + + if (!re_interpolate_groups(fmt, '$', "", groupc, groupv, "", outs, sizeof outs, NULL, NULL)) { + printf("%s/%zu XXX\n", fmt, groupc); + failed++; + return; + } + + failed += r = 0 != strcmp(outs, expected); + + printf("%s/%zu => %s%s\n", fmt, groupc, outs, + r ? " XXX" : ""); +} + +int main(void) { + const char *gn[] = { "one", "two", "three", "four" }; + const char **g0 = NULL; + const char *ga[] = { "1" }; + const char *gb[] = { "" }; +// const char *gc[] = { NULL }; // XXX: not permitted + + test("", 0, g0, ""); + test("", 4, gn, ""); + + test("x", 0, g0, "x"); + test("x", 4, gn, "x"); + + test("\001", 0, g0, "\001"); + test("\001", 4, gn, "\001"); + + test("$0", 0, gn, ""); + test("x$000000000000000000000x", 0, gn, "xx"); + test("x$000000000000000000001x", 1, gn, "xonex"); + test("x$100000000000000000000x", 1, gn, "xx"); + + test("$$$1$1$2$1$3$4$3$2$1$$$$", 4, gn, "$oneonetwoonethreefourthreetwoone$$"); + test("$$$$$$$$$$$$$$$$$$$$", 4, gn, "$$$$$$$$$$"); + + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 4, gn, "xyz_one..three;three,$.one-four="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 3, gn, "xyz_one..three;three,$.one-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 2, gn, "xyz_one..;,$.one-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gn, "xyz_one..;,$.one-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 0, g0, "xyz_..;,$.-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, ga, "xyz_1..;,$.1-="); + test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gb, "xyz_..;,$.-="); + + return failed; +} + diff --git a/tests/re_interpolate_groups/re_interpolate_groups1.c b/tests/re_interpolate_groups/re_interpolate_groups1.c new file mode 100644 index 000000000..01dc3b344 --- /dev/null +++ 
b/tests/re_interpolate_groups/re_interpolate_groups1.c @@ -0,0 +1,75 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include +#include + +#include +#include + +static unsigned failed; + +static void +test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, + unsigned expected_start, unsigned expected_end) +{ + struct re_pos start, end; + char outs[10]; + bool rs, re; + + assert(fmt != NULL); + + outs[0] = 'x'; + + /* for these tests we're expecting to error */ + if (re_interpolate_groups(fmt, '$', "", groupc, groupv, ne, outs, sizeof outs, &start, &end)) { + printf("%s/%zu XXX\n", fmt, groupc); + failed++; + return; + } + + if (outs[0] != '\0') { + failed++; + } + + failed += rs = expected_start != start.byte; + failed += re = expected_end != end.byte; + + printf("%s/%zu => :%u-%u :%u-%u '%.*s'%s\n", fmt, groupc, + start.byte, end.byte, + expected_start, expected_end, + (int) (end.byte - start.byte), fmt + start.byte, + (rs || re) ? 
" XXX" : ""); +} + +int main(void) { + const char *ne = ""; + + const char *gn[] = { "one", "two", "three", "four" }; + const char **g0 = NULL; + + test_err("$", 0, g0, ne, 0, 1); + test_err("$x", 0, g0, ne, 0, 1); + test_err("$ ", 4, gn, ne, 0, 1); + test_err("$\\01", 0, g0, ne, 0, 1); + + test_err("$0$", 0, g0, ne, 2, 3); + test_err("$$$x", 4, gn, ne, 2, 3); + + test_err("xyz$1", 0, gn, NULL, 3, 5); + test_err("xyz$2", 1, gn, NULL, 3, 5); + + test_err("01234567890", 1, gn, ne, 0, 10); + test_err("$$$$$$$$$$$$$$$$$$$$", 1, gn, ne, 0, 20); + test_err("$1$1$1$$", 1, gn, ne, 0, 8); + test_err("$1$1$1x", 1, gn, ne, 0, 7); + test_err("xxxyyyzzz$$", 1, gn, ne, 0, 11); + + return failed; +} + diff --git a/tests/re_interpolate_groups/re_interpolate_groups2.c b/tests/re_interpolate_groups/re_interpolate_groups2.c new file mode 100644 index 000000000..d33fe656a --- /dev/null +++ b/tests/re_interpolate_groups/re_interpolate_groups2.c @@ -0,0 +1,40 @@ +/* + * Copyright 2026 Katherine Flavel + * + * See LICENCE for the full copyright terms. + */ + +#include +#include +#include +#include + +#include +#include + +static unsigned failed; + +static void +test(const char *fmt, bool expected) +{ + bool r; + + assert(fmt != NULL); + + r = re_interpolate_groups(fmt, '$', "", 0, NULL, "", NULL, 0, NULL, NULL); + + failed += r != expected; + + printf("%s/%d => %d%s\n", fmt, 0, r, + r != expected ? 
" XXX" : ""); +} + +int main(void) { + test("", true); + test("abc", true); + test("$$", true); + test("$x", false); + + return failed; +} + diff --git a/tests/regressions/Makefile b/tests/regressions/Makefile new file mode 100644 index 000000000..624353ef6 --- /dev/null +++ b/tests/regressions/Makefile @@ -0,0 +1,20 @@ +.include "../../share/mk/top.mk" + +TEST.tests/regressions != ls -1 tests/regressions/regressions*.c +TEST_SRCDIR.tests/regressions = tests/regressions +TEST_OUTDIR.tests/regressions = ${BUILD}/tests/regressions + +.for n in ${TEST.tests/regressions:T:R:C/^regressions//} +INCDIR.${TEST_SRCDIR.tests/regressions}/regressions${n}.c += src/adt +.endfor + +.for n in ${TEST.tests/regressions:T:R:C/^regressions//} +test:: ${TEST_OUTDIR.tests/regressions}/res${n} +SRC += ${TEST_SRCDIR.tests/regressions}/regressions${n}.c +CFLAGS.${TEST_SRCDIR.tests/regressions}/regressions${n}.c += -UNDEBUG + +${TEST_OUTDIR.tests/regressions}/run${n}: ${TEST_OUTDIR.tests/regressions}/regressions${n}.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a + ${CC} ${CFLAGS} ${CFLAGS.${TEST_SRCDIR.tests/regressions}/regressions${n}.c} -o ${TEST_OUTDIR.tests/regressions}/run${n} ${TEST_OUTDIR.tests/regressions}/regressions${n}.o ${BUILD}/lib/libfsm.a ${BUILD}/lib/libre.a +${TEST_OUTDIR.tests/regressions}/res${n}: ${TEST_OUTDIR.tests/regressions}/run${n} + ( ${TEST_OUTDIR.tests/regressions}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/regressions}/res${n} +.endfor diff --git a/tests/regressions/regressions_determinise_state_limit_leak.c b/tests/regressions/regressions_determinise_state_limit_leak.c new file mode 100644 index 000000000..afdfbd9c5 --- /dev/null +++ b/tests/regressions/regressions_determinise_state_limit_leak.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include + +static const char *strings[] = { + [0] = "apple", + [1] = "banana", + [2] = "carrot", + [3] = "durian", + [4] = "eggplant", +}; +#define 
STRING_COUNT sizeof(strings)/sizeof(strings[0]) + +int main(void) +{ + struct fsm *fsms[STRING_COUNT] = {0}; + + for (size_t i = 0; i < STRING_COUNT; i++) { + fsms[i] = re_comp(RE_PCRE, fsm_sgetc, &strings[i], NULL, 0, NULL); + assert(fsms[i] != NULL); + } + + struct fsm *combined_fsm = fsm_union_array(STRING_COUNT, fsms, NULL); + assert(combined_fsm != NULL); + + size_t state_limit_base = fsm_countstates(combined_fsm); + size_t max_state_limit = state_limit_base + 100; + + bool hit_state_limit = false; + + for (size_t state_limit = state_limit_base; state_limit < max_state_limit; state_limit += 10) { + struct fsm *cp = fsm_clone(combined_fsm); + + const struct fsm_determinise_config det_config = { + .state_limit = state_limit, + }; + + /* Previously this would leak memory when hitting the STATE_LIMIT_REACHED + * early exit, because the edge sets for the DFA being constructed were + * not freed properly. + * + * The first time this should fail immediately because the state limit IS the starting size, + * but later on it should halt in the middle of construction. */ + switch (fsm_determinise_with_config(cp, &det_config)) { + case FSM_DETERMINISE_WITH_CONFIG_OK: + fsm_free(cp); + break; + case FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED: + hit_state_limit = true; + fsm_free(cp); + break; + case FSM_DETERMINISE_WITH_CONFIG_ERRNO: + assert(!"internal error"); + return EXIT_FAILURE; + } + } + + assert(hit_state_limit); + + fsm_free(combined_fsm); + return EXIT_SUCCESS; +}