tokenizer: Add backtick delimited strings

This is useful when writing strings that contain a lot of backslashes, e.g. in regular expressions.
2023-11-07 21:40:02 +01:00 · 2023-11-07 21:40:02 +01:00 · aa6346eafa
commit aa6346eafa
parent 85bafd0ef9
6 changed files with 121 additions and 11 deletions
--- a/README.md
+++ b/README.md
@ -75,6 +75,8 @@ The following escape sequences are recognized:

 Also, a `'` followed by a variable name evaluates to a string with the variable name as its content. `'foo-bar` and `"foo-bar"` are equivalent.

+A string can also be delimited by backticks (`` ` ``). Inside backticks, `\`-escapes are not interpreted. To include a literal backtick, double it. For example, ``` `foo``bar\baz` ``` and ```"foo`bar\\baz" ``` are equivalent.
+
 ### Lists

 A list is created by `[]` with optional expressions inside the brackets. The expressions can be separated by commas (`,`), newlines, semicolons (`;`) or simply whitespace.
--- a/contrib/SublimeTextPackage/Apfl.sublime-syntax
+++ b/contrib/SublimeTextPackage/Apfl.sublime-syntax
@ -8,6 +8,8 @@ scope: source.apfl

 contexts:
  main:
+    - match: '`'
+      push: backtick_string
    - match: '"'
      push: string
    - match: "#.*$"
@ -35,3 +37,9 @@ contexts:
      scope: constant.character.escape.apfl
    - match: '"'
      pop: true
+  backtick_string:  # Context pushed from `main` when an opening backtick is matched.
+    - meta_scope: string.quoted.backtick.apfl
+    - match: "``"  # Doubled backtick: an escaped literal backtick; listed before the closing rule.
+      scope: constant.character.escape.apfl
+    - match: '`(?!`)'  # A backtick not followed by another backtick closes the string.
+      pop: true
--- a/src/functional-tests/re.at
+++ b/src/functional-tests/re.at
@ -14,11 +14,11 @@ match := { ~args ->
    }
 }

-match "^f(o*)" "foooo"
-match "^f(o*)" "FoOoo"
-match "^f(o*)"::'i "FoOoo"
-match "^f(o*)" "f"
-match "^f(o*)" 1 "foooo"
+match `^f(o*)` "foooo"
+match `^f(o*)` "FoOoo"
+match `^f(o*)`::'i "FoOoo"
+match `^f(o*)` "f"
+match `^f(o*)` 1 "foooo"

 print ""

@ -30,7 +30,7 @@ replace "(x+)" { _ x -> len x } ""
 replace "(x+)" { _ x -> len x } "xxx"
 replace "(x+)" { _ x -> len x } "xxxyxyxx"
 replace "(x+)" { _ x -> len x } 2 "xxxyxyxx"
-replace "\\[(\\w+)\\]" "<$1>" "[foo] [bar]"
+replace `\[(\w+)\]` "<$1>" "[foo] [bar]"

 print ""

@ -41,8 +41,8 @@ match-all := { ~args ->
    }
 }

-match-all "f(\\w+)" "afoobar fizz abcdefg f"
-match-all "^(\\w+)\\s*=\\s*(.*)$"::'m "foo= bar\nbar    = 123"
+match-all `f(\w+)` "afoobar fizz abcdefg f"
+match-all `^(\w+)\s*=\s*(.*)$`::'m "foo= bar\nbar    = 123"

 ===== output =====
 -- match --
--- a/src/re.apfl
+++ b/src/re.apfl
@ -44,7 +44,7 @@
    replacement?(has type 'string) ->
        m := nil
        parts = []
-        while { m = match "^(.*?)\\$(\\d)(.*)$" replacement } {
+        while { m = match `^(.*?)\$(\d)(.*)$` replacement } {
            [_ head n replacement] = m
            parts = [~parts Plain::head Var::(tonumber n)]
        }
--- a/src/tokenizer.c
+++ b/src/tokenizer.c
@ -165,6 +165,7 @@ yield_simple_token(
 static enum apfl_parse_result comment(apfl_tokenizer_ptr);
 static enum apfl_parse_result colon(apfl_tokenizer_ptr);
 static enum apfl_parse_result string(apfl_tokenizer_ptr);
+static enum apfl_parse_result backtick_string(apfl_tokenizer_ptr);
 static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, unsigned char);
 static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool);
 static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool);
@ -276,6 +277,8 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
            return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position);
        case '\'':
            return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position);
+        case '`':
+            return backtick_string(tokenizer);
        case '#':
            return comment(tokenizer);
        case ':':
@ -533,6 +536,78 @@ string(apfl_tokenizer_ptr tokenizer)
    return out;
 }

+static enum apfl_parse_result // Scans the body of a backtick string into `text` and emits an APFL_TOK_STRING token.
+inner_backtick_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text) // The caller sets up and tears down `text`.
+{
+    struct apfl_position pos = tokenizer->position; // Position reported for the token; captured before any body byte is read.
+
+    unsigned char byte;
+
+    for (;;) {
+        switch (read_byte(tokenizer, &byte, true)) {
+        case RR_OK:
+            break;
+        case RR_ERR:
+            return APFL_PARSE_ERROR;
+        case RR_EOF: // Input ended before a closing backtick was seen: unterminated string.
+            tokenizer->next_mode = NM_EOF;
+            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
+            return APFL_PARSE_ERROR;
+        }
+
+        if (byte != '`') { // Every non-backtick byte (backslashes, quotes, newlines) is appended unchanged.
+            if (!apfl_string_builder_append_byte(text, byte)) {
+                tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
+                return APFL_PARSE_ERROR;
+            }
+            continue;
+        }
+
+        switch (read_byte(tokenizer, &byte, true)) { // Saw a backtick: read one more byte to tell "``" from the terminator.
+        case RR_OK:
+            break;
+        case RR_ERR:
+            return APFL_PARSE_ERROR;
+        case RR_EOF: // The backtick was the last byte of input, so it closed the string.
+            tokenizer->next_mode = NM_EOF;
+            goto finalize;
+        }
+
+        if (byte == '`') { // Doubled backtick: append one literal backtick and keep scanning.
+            if (!apfl_string_builder_append_byte(text, '`')) {
+                tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
+                return APFL_PARSE_ERROR;
+            }
+            continue;
+        }
+
+        unread_byte(tokenizer); // The lookahead byte is not part of the string; hand it back for the next token.
+
+        goto finalize;
+    }
+
+finalize:
+    tokenizer->token = (struct apfl_token) {
+        .type = APFL_TOK_STRING,
+        .position = pos,
+        .text = apfl_string_builder_move_string(text), // Presumably transfers the accumulated bytes out of the builder - confirm in the string builder API.
+    };
+    return APFL_PARSE_OK;
+}
+
+static enum apfl_parse_result // Tokenizes one backtick-delimited string; dispatched from apfl_tokenizer_next on '`'.
+backtick_string(apfl_tokenizer_ptr tokenizer)
+{
+    struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
+
+    enum apfl_parse_result out = inner_backtick_string(tokenizer, &text); // All scanning and token construction happens in the helper.
+
+    apfl_string_builder_deinit(&text); // Runs on both the success and the error path.
+
+    return out;
+}
+
+
 static enum apfl_parse_result
 finalize_maybe_name(
    apfl_tokenizer_ptr tokenizer,
@ -612,6 +687,7 @@ maybe_name_inner(
        case '#':
        case ':':
        case '"':
+        case '`':
        case ' ':
        case '\r':
        case '\t':
--- a/src/tokenizer_test.c
+++ b/src/tokenizer_test.c
@ -265,8 +265,10 @@ TEST(all_tokens, t) {
        "@ . ? ~ -> = :=\n"
    //   1234567
        "({[]})\n"
-    //   1234567
-        ": :: :="
+    //   12345678
+        ": :: :=\n"
+    //   1234567890
+        "`foo``bar`"
    );

    expect_text_token  (tt, 1, 1,  APFL_TOK_COMMENT, " test");
@ -306,6 +308,8 @@ TEST(all_tokens, t) {
    expect_simple_token(tt, 7, 1,  APFL_TOK_COLON);
    expect_simple_token(tt, 7, 3,  APFL_TOK_DOUBLE_COLON);
    expect_simple_token(tt, 7, 6,  APFL_TOK_LOCAL_ASSIGN);
+    expect_simple_token(tt, 7, 8,  APFL_TOK_LINEBREAK);
+    expect_text_token  (tt, 8, 1,  APFL_TOK_STRING, "foo`bar");

    expect_eof(tt);

@ -325,6 +329,25 @@ TEST(strings_with_binary_data, t) {
    destroy_tokenizer_test(tt);
 }

+TEST(backtick_strings, t) { // Covers backtick strings next to names, the empty string, "``" escapes and raw quote/newline bytes.
+    struct tokenizer_test *tt = new_tokenizer_test(
+        t,
+    //   1234567890123456789012345 6789 1
+        "`foo`bar``baz```` `ab``c\"d'e\nf`"
+    );
+
+    expect_text_token  (tt, 1, 1,  APFL_TOK_STRING, "foo");
+    expect_text_token  (tt, 1, 6,  APFL_TOK_NAME, "bar"); // A name may start directly after the closing backtick.
+    expect_text_token  (tt, 1, 9,  APFL_TOK_STRING, ""); // "``" right after a name opens and closes an empty string.
+    expect_text_token  (tt, 1, 11, APFL_TOK_NAME, "baz");
+    expect_text_token  (tt, 1, 14, APFL_TOK_STRING, "`"); // Four backticks: open, doubled (escaped) backtick, close.
+    expect_text_token  (tt, 1, 19, APFL_TOK_STRING, "ab`c\"d'e\nf"); // Quotes and the newline are kept as-is; "``" becomes one backtick.
+
+    expect_eof(tt);
+
+    destroy_tokenizer_test(tt);
+}
+
 TEST(err_invalid_bytes, t) {
    struct tokenizer_test *tt = new_tokenizer_test(t, "\x05" "foo\x01_bar\x7F" "baz");
    expect_error(tt, APFL_ERR_UNEXPECTED_BYTE);
@ -345,5 +368,6 @@ TESTS_BEGIN
    ADDTEST(assignment),
    ADDTEST(all_tokens),
    ADDTEST(strings_with_binary_data),
+    ADDTEST(backtick_strings),
    ADDTEST(err_invalid_bytes),
 TESTS_END