tokenizer: Add backtick delimited strings

This is useful when writing strings that contain a lot of backslashes,
e.g. in regular expressions.
This commit is contained in:
Laria 2023-11-07 21:40:02 +01:00
parent 85bafd0ef9
commit aa6346eafa
6 changed files with 121 additions and 11 deletions

View file

@ -75,6 +75,8 @@ The following escape sequences are recognized:
Also, a `'` followed by a variable name evaluates to a string with the variable name as its content. `'foo-bar` and `"foo-bar"` are equivalent.
Additionally, a string can be delimited by backticks (`` ` ``). Inside the backticks, `\`-escapes are not interpreted. To include a literal backtick, double it. For example, ``` `foo``bar\baz` ``` and ``` "foo`bar\\baz" ``` are equivalent.
### Lists
A list is created by `[]` with optional expressions inside the brackets. The expressions can be separated by commas (`,`), newlines, semicolons (`;`) or simply whitespace.

View file

@ -8,6 +8,8 @@ scope: source.apfl
contexts:
main:
- match: '`'
push: backtick_string
- match: '"'
push: string
- match: "#.*$"
@ -35,3 +37,9 @@ contexts:
scope: constant.character.escape.apfl
- match: '"'
pop: true
backtick_string:
- meta_scope: string.quoted.backtick.apfl
- match: "``"
scope: constant.character.escape.apfl
- match: '`(?!`)'
pop: true

View file

@ -14,11 +14,11 @@ match := { ~args ->
}
}
match "^f(o*)" "foooo"
match "^f(o*)" "FoOoo"
match "^f(o*)"::'i "FoOoo"
match "^f(o*)" "f"
match "^f(o*)" 1 "foooo"
match `^f(o*)` "foooo"
match `^f(o*)` "FoOoo"
match `^f(o*)`::'i "FoOoo"
match `^f(o*)` "f"
match `^f(o*)` 1 "foooo"
print ""
@ -30,7 +30,7 @@ replace "(x+)" { _ x -> len x } ""
replace "(x+)" { _ x -> len x } "xxx"
replace "(x+)" { _ x -> len x } "xxxyxyxx"
replace "(x+)" { _ x -> len x } 2 "xxxyxyxx"
replace "\\[(\\w+)\\]" "<$1>" "[foo] [bar]"
replace `\[(\w+)\]` "<$1>" "[foo] [bar]"
print ""
@ -41,8 +41,8 @@ match-all := { ~args ->
}
}
match-all "f(\\w+)" "afoobar fizz abcdefg f"
match-all "^(\\w+)\\s*=\\s*(.*)$"::'m "foo= bar\nbar = 123"
match-all `f(\w+)` "afoobar fizz abcdefg f"
match-all `^(\w+)\s*=\s*(.*)$`::'m "foo= bar\nbar = 123"
===== output =====
-- match --

View file

@ -44,7 +44,7 @@
replacement?(has type 'string) ->
m := nil
parts = []
while { m = match "^(.*?)\\$(\\d)(.*)$" replacement } {
while { m = match `^(.*?)\$(\d)(.*)$` replacement } {
[_ head n replacement] = m
parts = [~parts Plain::head Var::(tonumber n)]
}

View file

@ -165,6 +165,7 @@ yield_simple_token(
static enum apfl_parse_result comment(apfl_tokenizer_ptr);
static enum apfl_parse_result colon(apfl_tokenizer_ptr);
static enum apfl_parse_result string(apfl_tokenizer_ptr);
static enum apfl_parse_result backtick_string(apfl_tokenizer_ptr);
static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, unsigned char);
static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool);
static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool);
@ -276,6 +277,8 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position);
case '\'':
return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position);
case '`':
return backtick_string(tokenizer);
case '#':
return comment(tokenizer);
case ':':
@ -533,6 +536,78 @@ string(apfl_tokenizer_ptr tokenizer)
return out;
}
/*
 * Scan the body of a backtick-delimited string into `text` and store the
 * resulting APFL_TOK_STRING token in tokenizer->token.
 *
 * The token position is tokenizer->position as it is on entry (presumably
 * still the opening backtick, which the caller has already consumed).
 *
 * No backslash escapes are interpreted. The only special byte is the
 * backtick itself: two in a row add one literal backtick to the string, a
 * single one ends the string.
 *
 * Returns APFL_PARSE_OK once the token has been stored, APFL_PARSE_ERROR if
 * - read_byte reports RR_ERR (no error is set here; presumably read_byte
 *   already did that),
 * - the input ends before a closing backtick (APFL_ERR_UNEXPECTED_EOF), or
 * - appending to `text` fails (APFL_ERR_MALLOC_FAILED).
 */
static enum apfl_parse_result
inner_backtick_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
{
    struct apfl_position start = tokenizer->position;
    unsigned char ch;

    for (;;) {
        switch (read_byte(tokenizer, &ch, true)) {
        case RR_OK:
            break;
        case RR_ERR:
            return APFL_PARSE_ERROR;
        case RR_EOF:
            // Input ended while the string was still open.
            tokenizer->next_mode = NM_EOF;
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
            return APFL_PARSE_ERROR;
        }

        if (ch == '`') {
            // A backtick either closes the string or starts a doubled
            // backtick; one byte of lookahead decides which.
            // NOTE(review): the lookahead also passes need = true although
            // the string is already complete here - confirm that is intended.
            bool closed = false;

            switch (read_byte(tokenizer, &ch, true)) {
            case RR_OK:
                if (ch != '`') {
                    // The byte belongs to whatever follows the string.
                    unread_byte(tokenizer);
                    closed = true;
                }
                break;
            case RR_ERR:
                return APFL_PARSE_ERROR;
            case RR_EOF:
                // The string was closed by the very last byte of the input.
                tokenizer->next_mode = NM_EOF;
                closed = true;
                break;
            }

            if (closed) {
                break;
            }
            // Doubled backtick: ch is '`' and gets appended below.
        }

        if (!apfl_string_builder_append_byte(text, ch)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }
    }

    tokenizer->token = (struct apfl_token) {
        .type = APFL_TOK_STRING,
        .position = start,
        .text = apfl_string_builder_move_string(text),
    };
    return APFL_PARSE_OK;
}
/*
 * Tokenize a backtick-delimited string.
 *
 * Sets up a string builder on the tokenizer's allocator, lets
 * inner_backtick_string() do the scanning and releases the builder again,
 * no matter whether scanning succeeded. The result of the scan is returned
 * unchanged.
 */
static enum apfl_parse_result
backtick_string(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_string_builder builder = apfl_string_builder_init(tokenizer->allocator);

    enum apfl_parse_result result = inner_backtick_string(tokenizer, &builder);

    apfl_string_builder_deinit(&builder);
    return result;
}
static enum apfl_parse_result
finalize_maybe_name(
apfl_tokenizer_ptr tokenizer,
@ -612,6 +687,7 @@ maybe_name_inner(
case '#':
case ':':
case '"':
case '`':
case ' ':
case '\r':
case '\t':

View file

@ -265,8 +265,10 @@ TEST(all_tokens, t) {
"@ . ? ~ -> = :=\n"
// 1234567
"({[]})\n"
// 1234567
": :: :="
// 12345678
": :: :=\n"
// 1234567890
"`foo``bar`"
);
expect_text_token (tt, 1, 1, APFL_TOK_COMMENT, " test");
@ -306,6 +308,8 @@ TEST(all_tokens, t) {
expect_simple_token(tt, 7, 1, APFL_TOK_COLON);
expect_simple_token(tt, 7, 3, APFL_TOK_DOUBLE_COLON);
expect_simple_token(tt, 7, 6, APFL_TOK_LOCAL_ASSIGN);
expect_simple_token(tt, 7, 8, APFL_TOK_LINEBREAK);
expect_text_token (tt, 8, 1, APFL_TOK_STRING, "foo`bar");
expect_eof(tt);
@ -325,6 +329,25 @@ TEST(strings_with_binary_data, t) {
destroy_tokenizer_test(tt);
}
// Backtick strings: doubled backticks as escapes, names and strings placed
// directly next to each other, and quotes / a raw newline inside a string.
TEST(backtick_strings, t) {
    // The input is split into one literal per expected token; the pieces
    // concatenate to a single source text. Columns on line 1:
    //   `foo` 1-5 | bar 6-8 | `` 9-10 | baz 11-13 | ```` 14-17 | ' ' 18 | rest from 19
    struct tokenizer_test *tt = new_tokenizer_test(
        t,
        "`foo`"             // plain backtick string
        "bar"               // name starting right after the closing backtick
        "``"                // empty string
        "baz"
        "````"              // string consisting of one escaped backtick
        " "
        "`ab``c\"d'e\nf`"   // escaped backtick, both quote kinds, raw newline
    );

    expect_text_token(tt, 1, 1, APFL_TOK_STRING, "foo");
    expect_text_token(tt, 1, 6, APFL_TOK_NAME, "bar");
    expect_text_token(tt, 1, 9, APFL_TOK_STRING, "");
    expect_text_token(tt, 1, 11, APFL_TOK_NAME, "baz");
    expect_text_token(tt, 1, 14, APFL_TOK_STRING, "`");
    expect_text_token(tt, 1, 19, APFL_TOK_STRING, "ab`c\"d'e\nf");
    expect_eof(tt);

    destroy_tokenizer_test(tt);
}
TEST(err_invalid_bytes, t) {
struct tokenizer_test *tt = new_tokenizer_test(t, "\x05" "foo\x01_bar\x7F" "baz");
expect_error(tt, APFL_ERR_UNEXPECTED_BYTE);
@ -345,5 +368,6 @@ TESTS_BEGIN
ADDTEST(assignment),
ADDTEST(all_tokens),
ADDTEST(strings_with_binary_data),
ADDTEST(backtick_strings),
ADDTEST(err_invalid_bytes),
TESTS_END