apfl/src/tokenizer.c
Laria Carolin Chabowski aa6346eafa tokenizer: Add backtick delimited strings
This is useful when writing strings that contain a lot of backslashes,
e.g. in regular expressions.
2023-11-07 21:45:27 +01:00

872 lines
24 KiB
C

#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include "apfl.h"
#include "alloc.h"
#include "parsing.h"
// Size in bytes of the tokenizer's read buffer (allocated in apfl_tokenizer_new()).
#define BUFSIZE 4096
// Type used for positions and lengths inside the read buffer.
typedef int buf_offset;
static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset");
// When refilling, read_byte() keeps the previously read byte in buf[0], so the
// buffer must have room for that byte plus at least one new byte.
static_assert(BUFSIZE >= 2, "BUFSIZE must be at least 2");
// State of one tokenizer instance (created by apfl_tokenizer_new()).
struct apfl_tokenizer {
// Allocator used for the tokenizer itself, its read buffer and token texts.
struct apfl_allocator allocator;
// Source of input bytes; read_byte() invokes its callback to refill buf.
struct apfl_source_reader source_reader;
// Read buffer of BUFSIZE bytes.
unsigned char *buf;
// Index into buf of the next byte read_byte() will hand out.
buf_offset buf_pos;
// Number of valid bytes currently stored in buf.
buf_offset buf_len;
// What the next apfl_tokenizer_next() call does before (or instead of)
// reading further input.
enum {
// Tokenize the input normally.
NM_REGULAR,
// Emit a pending MAPSTO token positioned at pos_for_mapsto.
NM_MAPSTO,
// Emit a pending ASSIGN token.
NM_ASSIGN,
// End of input was reached; report APFL_PARSE_EOF.
NM_EOF,
} next_mode;
// Position of the "->" whose MAPSTO token is still pending (see NM_MAPSTO).
struct apfl_position pos_for_mapsto;
// Position of the most recently read byte (col is 0 before the first read).
struct apfl_position position;
// Value of position before the last read_byte(); restored by unread_byte().
struct apfl_position last_position;
// Whether the most recently read byte was '\n'; if so, the next read_byte()
// advances to the next line.
bool last_byte_was_linebreak;
// Value of last_byte_was_linebreak before the last read_byte(); restored by
// unread_byte().
bool prev_last_byte_was_linebreak;
// Result of the last apfl_tokenizer_next() call. token and error share
// storage; presumably only the one matching the returned parse result is
// meaningful.
union {
struct apfl_token token;
struct apfl_error error;
};
};
apfl_tokenizer_ptr
apfl_tokenizer_new(struct apfl_allocator allocator, struct apfl_source_reader source_reader)
{
apfl_tokenizer_ptr tokenizer = ALLOC_OBJ(allocator, struct apfl_tokenizer);
if (tokenizer == NULL) {
return NULL;
}
tokenizer->allocator = allocator;
tokenizer->source_reader = source_reader;
if ((tokenizer->buf = ALLOC_BYTES(allocator, BUFSIZE)) == NULL) {
FREE_OBJ(allocator, tokenizer);
return NULL;
}
tokenizer->buf_pos = 0;
tokenizer->buf_len = 0;
tokenizer->position = (struct apfl_position) {
.line = 1,
.col = 0, // The first character was not yet read
};
tokenizer->last_byte_was_linebreak = false;
tokenizer->prev_last_byte_was_linebreak = false;
tokenizer->next_mode = NM_REGULAR;
return tokenizer;
}
// Frees the tokenizer's read buffer and the tokenizer object itself.
// Passing NULL is allowed and does nothing.
void
apfl_tokenizer_destroy(apfl_tokenizer_ptr tokenizer)
{
    if (tokenizer != NULL) {
        FREE_BYTES(tokenizer->allocator, tokenizer->buf, BUFSIZE);
        FREE_OBJ(tokenizer->allocator, tokenizer);
    }
}
// Returns a copy of the token currently stored in the tokenizer.
struct apfl_token
apfl_tokenizer_get_token(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_token current = tokenizer->token;
    return current;
}
// Returns a copy of the error currently stored in the tokenizer.
struct apfl_error
apfl_tokenizer_get_error(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_error current = tokenizer->error;
    return current;
}
// Reads the next input byte into *byte, refilling the read buffer through the
// source reader callback when it is exhausted.
//
// need is handed to the source reader callback unchanged.
//
// Returns RR_OK when a byte was read, RR_EOF when the reader delivered no
// further bytes and RR_ERR when the reader callback failed (tokenizer->error
// is set to APFL_ERR_INPUT_ERROR then). Position tracking and the state needed
// by unread_byte() are only updated when a byte was actually read.
static enum read_result
read_byte(apfl_tokenizer_ptr tokenizer, unsigned char *byte, bool need)
{
    if (tokenizer->buf_pos >= tokenizer->buf_len) {
        // Keep the most recently read byte in slot 0, so an unread_byte()
        // directly after the refill can still step back onto it.
        size_t off = 0;
        if (tokenizer->buf_len > 0) {
            off = 1;
            tokenizer->buf[0] = tokenizer->buf[tokenizer->buf_len - 1];
        }

        // In: capacity offered to the reader. Out: number of bytes it wrote.
        size_t len = BUFSIZE - off;

        tokenizer->buf_pos = (buf_offset)off;
        tokenizer->buf_len = (buf_offset)off;

        if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf+off, &len, need)) {
            // error shares its storage with token, so assign the whole struct
            // instead of only the type; otherwise the remaining fields would
            // keep stale bytes of a previously stored token.
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_INPUT_ERROR };
            return RR_ERR;
        }

        tokenizer->buf_len = (buf_offset)(len + off);

        if (len == 0) {
            return RR_EOF;
        }
    }

    // Remember the state before this read, so unread_byte() can restore it.
    tokenizer->prev_last_byte_was_linebreak = tokenizer->last_byte_was_linebreak;
    tokenizer->last_position = tokenizer->position;

    // A linebreak only moves the position to the next line once the byte
    // following it is read.
    if (tokenizer->last_byte_was_linebreak) {
        tokenizer->position.line++;
        tokenizer->position.col = 0;
    }

    *byte = tokenizer->buf[tokenizer->buf_pos];
    tokenizer->buf_pos++;

    tokenizer->last_byte_was_linebreak = (*byte == '\n');
    tokenizer->position.col++;

    return RR_OK;
}
// Steps back over the byte returned by the preceding read_byte() call, so the
// next read_byte() returns it again, and restores the position tracking.
//
// Only at most 1 unread_byte() call is allowed after a read_byte() call!
static void
unread_byte(apfl_tokenizer_ptr tokenizer)
{
    assert(tokenizer->buf_pos > 0);
    tokenizer->buf_pos -= 1;

    tokenizer->last_byte_was_linebreak = tokenizer->prev_last_byte_was_linebreak;
    tokenizer->position = tokenizer->last_position;
}
// Stores a token that consists only of a type and a position as the current
// token. Only these two fields of the stored token are written.
// Always returns APFL_PARSE_OK.
static enum apfl_parse_result
yield_simple_token(
    apfl_tokenizer_ptr tokenizer,
    enum apfl_token_type type,
    struct apfl_position pos
) {
    struct apfl_token *out = &tokenizer->token;
    out->position = pos;
    out->type = type;
    return APFL_PARSE_OK;
}
// Forward declarations of the token scanners that minus() and
// apfl_tokenizer_next() use before their definitions.
static enum apfl_parse_result comment(apfl_tokenizer_ptr);
static enum apfl_parse_result colon(apfl_tokenizer_ptr);
static enum apfl_parse_result string(apfl_tokenizer_ptr);
static enum apfl_parse_result backtick_string(apfl_tokenizer_ptr);
// Parameters: tokenizer, need (forwarded to read_byte()), first byte of the name.
static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, unsigned char);
// Parameters: tokenizer, numeric base, position of the number, whether it is negative.
static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool);
// Parameters: tokenizer, position of the number, whether it is negative.
static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool);
// Tells whether byte is an ASCII control character, i.e. in the range
// 0x00-0x1F or equal to 0x7F (DEL).
static bool
is_control_byte(unsigned char byte)
{
    if (byte == 0x7F) {
        return true;
    }
    return byte <= 0x1F;
}
// Continues tokenizing after a '-' was read. Depending on the following byte
// this results in a negative number, a MAPSTO token ("->") or a name that
// starts with '-'. A '-' directly at the end of the input is the name "-".
static enum apfl_parse_result
minus(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_position start = tokenizer->position;
    unsigned char next;

    enum read_result rr = read_byte(tokenizer, &next, true);
    if (rr == RR_ERR) {
        return APFL_PARSE_ERROR;
    }
    if (rr == RR_EOF) {
        tokenizer->next_mode = NM_EOF;

        struct apfl_string name = apfl_string_blank();
        if (!apfl_string_copy(tokenizer->allocator, &name, apfl_string_view_from("-"))) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }

        tokenizer->token = (struct apfl_token) {
            .type = APFL_TOK_NAME,
            .position = start,
            .text = name,
        };
        return APFL_PARSE_OK;
    }

    if (next == '0') {
        // The '0' stays consumed; zero() inspects what follows it.
        return zero(tokenizer, start, true);
    }
    if (next == '>') {
        return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, start);
    }

    // Everything else is re-read by the scanner that takes over.
    unread_byte(tokenizer);
    if (isdigit(next)) {
        return number(tokenizer, 10, start, true);
    }
    return maybe_name(tokenizer, true, '-');
}
enum apfl_parse_result
apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
{
switch (tokenizer->next_mode) {
case NM_REGULAR:
break;
case NM_MAPSTO:
tokenizer->next_mode = NM_REGULAR;
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto);
case NM_ASSIGN:
tokenizer->next_mode = NM_REGULAR;
return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position);
case NM_EOF:
return APFL_PARSE_EOF;
}
unsigned char byte;
for (;;) {
switch (read_byte(tokenizer, &byte, need)) {
case RR_OK:
break;
case RR_ERR:
return APFL_PARSE_ERROR;
case RR_EOF:
tokenizer->next_mode = NM_EOF;
return APFL_PARSE_EOF;
}
switch (byte) {
case '(':
return yield_simple_token(tokenizer, APFL_TOK_LPAREN, tokenizer->position);
case ')':
return yield_simple_token(tokenizer, APFL_TOK_RPAREN, tokenizer->position);
case '[':
return yield_simple_token(tokenizer, APFL_TOK_LBRACKET, tokenizer->position);
case ']':
return yield_simple_token(tokenizer, APFL_TOK_RBRACKET, tokenizer->position);
case '{':
return yield_simple_token(tokenizer, APFL_TOK_LBRACE, tokenizer->position);
case '}':
return yield_simple_token(tokenizer, APFL_TOK_RBRACE, tokenizer->position);
case '~':
return yield_simple_token(tokenizer, APFL_TOK_EXPAND, tokenizer->position);
case '.':
return yield_simple_token(tokenizer, APFL_TOK_DOT, tokenizer->position);
case '@':
return yield_simple_token(tokenizer, APFL_TOK_AT, tokenizer->position);
case ';':
return yield_simple_token(tokenizer, APFL_TOK_SEMICOLON, tokenizer->position);
case '\n':
return yield_simple_token(tokenizer, APFL_TOK_LINEBREAK, tokenizer->position);
case '\\':
return yield_simple_token(tokenizer, APFL_TOK_CONTINUE_LINE, tokenizer->position);
case ',':
return yield_simple_token(tokenizer, APFL_TOK_COMMA, tokenizer->position);
case '?':
return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position);
case '\'':
return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position);
case '`':
return backtick_string(tokenizer);
case '#':
return comment(tokenizer);
case ':':
return colon(tokenizer);
case '"':
return string(tokenizer);
case '-':
return minus(tokenizer);
case ' ':
case '\r':
case '\t':
// Skip whitespace
break;
case '0':
return zero(tokenizer, tokenizer->position, false);
default:
if (is_control_byte(byte)) {
// Disallow ASCII control characters here
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_UNEXPECTED_BYTE,
.position = tokenizer->position,
.byte = byte,
};
return APFL_PARSE_ERROR;
} else if (isdigit(byte)) {
struct apfl_position position = tokenizer->position;
unread_byte(tokenizer);
return number(tokenizer, 10, position, false);
} else {
return maybe_name(tokenizer, need, byte);
}
}
}
}
// Collects the bytes of a comment (everything after the '#' up to, but not
// including, the next linebreak or the end of the input) in text and stores
// the resulting COMMENT token.
//
// The caller owns text and has to deinitialize it, whatever the result is.
static enum apfl_parse_result
inner_comment(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
{
    unsigned char byte;
    struct apfl_position pos = tokenizer->position;

    for (;;) {
        switch (read_byte(tokenizer, &byte, true)) {
        case RR_OK:
            break;
        case RR_ERR:
            return APFL_PARSE_ERROR;
        case RR_EOF:
            tokenizer->next_mode = NM_EOF;
            goto finalize;
        }

        if (byte == '\n') {
            // The linebreak is not part of the comment; it becomes its own token.
            unread_byte(tokenizer);
            goto finalize;
        }

        if (!apfl_string_builder_append_byte(text, byte)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }
    }

finalize:
    tokenizer->token = (struct apfl_token) {
        .type = APFL_TOK_COMMENT,
        .position = pos,
        .text = apfl_string_builder_move_string(text),
    };
    return APFL_PARSE_OK;
}

// Tokenizes a comment after its leading '#' was read.
//
// The string builder is deinitialized on every path, so the bytes collected so
// far are not leaked when reading or appending fails. This is the same
// wrapper pattern string(), backtick_string() and maybe_name() use.
static enum apfl_parse_result
comment(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
    enum apfl_parse_result out = inner_comment(tokenizer, &text);
    apfl_string_builder_deinit(&text);
    return out;
}
// Continues tokenizing after a ':' was read: ":=" is LOCAL_ASSIGN, "::" is
// DOUBLE_COLON and anything else (including the end of the input) a COLON.
static enum apfl_parse_result
colon(apfl_tokenizer_ptr tokenizer)
{
    unsigned char byte;
    struct apfl_position pos = tokenizer->position;

    switch (read_byte(tokenizer, &byte, true)) {
    case RR_OK:
        break;
    case RR_ERR:
        return APFL_PARSE_ERROR;
    case RR_EOF:
        // Remember the EOF like all other scanners do, so the next
        // apfl_tokenizer_next() call reports it without asking the source
        // reader again.
        tokenizer->next_mode = NM_EOF;
        return yield_simple_token(tokenizer, APFL_TOK_COLON, pos);
    }

    switch (byte) {
    case '=':
        return yield_simple_token(tokenizer, APFL_TOK_LOCAL_ASSIGN, pos);
    case ':':
        return yield_simple_token(tokenizer, APFL_TOK_DOUBLE_COLON, pos);
    default:
        // Not part of this token, leave it for the next one.
        unread_byte(tokenizer);
        return yield_simple_token(tokenizer, APFL_TOK_COLON, pos);
    }
}
// Appends byte to text. Returns APFL_PARSE_OK on success; if appending fails,
// an APFL_ERR_MALLOC_FAILED error is stored and APFL_PARSE_ERROR is returned.
static enum apfl_parse_result
append_single_byte(
    apfl_tokenizer_ptr tokenizer,
    struct apfl_string_builder *text,
    char byte
) {
    if (apfl_string_builder_append_byte(text, byte)) {
        return APFL_PARSE_OK;
    }

    tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
    return APFL_PARSE_ERROR;
}
// Reads the two hex digits of a "\x" escape sequence and appends the byte
// they encode to text. Fails on end of input and on anything that is not a
// hex digit.
static enum apfl_parse_result
hex_escape(
    apfl_tokenizer_ptr tokenizer,
    struct apfl_string_builder *text
) {
    unsigned char value = 0;
    int digits_left = 2;

    while (digits_left-- > 0) {
        unsigned char digit;
        enum read_result rr = read_byte(tokenizer, &digit, true);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
            return APFL_PARSE_ERROR;
        }

        int nibble = apfl_parse_digit(digit);
        if (!(nibble >= 0 && nibble <= 0xF)) {
            tokenizer->error = (struct apfl_error) {
                .type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE,
                .position = tokenizer->position,
            };
            return APFL_PARSE_ERROR;
        }

        // Make room for the new digit in the low nibble.
        value = (unsigned char)((value << 4) | (nibble & 0xF));
    }

    return append_single_byte(tokenizer, text, value);
}
// Handles an escape sequence inside a double quoted string after its
// backslash was read and appends the byte it stands for to text.
//
// Supported are \x / \X (two hex digits), \\, \n, \r, \t, \" and \0. Everything
// else, including \u / \U (unicode escapes are not implemented), is rejected
// with APFL_ERR_INVALID_ESCAPE_SEQUENCE.
static enum apfl_parse_result
escape_sequence(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
{
    struct apfl_position backslash_pos = tokenizer->position;
    unsigned char selector;

    enum read_result rr = read_byte(tokenizer, &selector, true);
    if (rr == RR_ERR) {
        return APFL_PARSE_ERROR;
    }
    if (rr == RR_EOF) {
        tokenizer->next_mode = NM_EOF;
        tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
        return APFL_PARSE_ERROR;
    }

    char unescaped;
    switch (selector) {
    case 'x':
    case 'X':
        return hex_escape(tokenizer, text);
    case '\\':
        unescaped = '\\';
        break;
    case 'n':
        unescaped = '\n';
        break;
    case 'r':
        unescaped = '\r';
        break;
    case 't':
        unescaped = '\t';
        break;
    case '"':
        unescaped = '"';
        break;
    case '0':
        unescaped = 0;
        break;
    default:
        tokenizer->error = (struct apfl_error) {
            .type = APFL_ERR_INVALID_ESCAPE_SEQUENCE,
            .position = backslash_pos,
            .byte = selector,
        };
        return APFL_PARSE_ERROR;
    }

    return append_single_byte(tokenizer, text, unescaped);
}
// Reads the body of a double quoted string (the opening quote was already
// consumed) into text and stores the STRING token once the closing quote is
// found. Backslashes introduce escape sequences. Reaching the end of the
// input before the closing quote is an APFL_ERR_UNEXPECTED_EOF error.
static enum apfl_parse_result
inner_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
{
    struct apfl_position start = tokenizer->position;
    unsigned char byte;

    for (;;) {
        enum read_result rr = read_byte(tokenizer, &byte, true);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
            return APFL_PARSE_ERROR;
        }

        if (byte == '"') {
            tokenizer->token = (struct apfl_token) {
                .type = APFL_TOK_STRING,
                .position = start,
                .text = apfl_string_builder_move_string(text),
            };
            return APFL_PARSE_OK;
        }

        if (byte == '\\') {
            enum apfl_parse_result escaped = escape_sequence(tokenizer, text);
            if (escaped != APFL_PARSE_OK) {
                return escaped;
            }
            continue;
        }

        if (!apfl_string_builder_append_byte(text, byte)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }
    }
}
// Tokenizes a double quoted string after its opening quote was read. Owns the
// string builder used for the text and releases it on every path.
static enum apfl_parse_result
string(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_string_builder builder = apfl_string_builder_init(tokenizer->allocator);

    enum apfl_parse_result result = inner_string(tokenizer, &builder);

    apfl_string_builder_deinit(&builder);
    return result;
}
// Reads the body of a backtick delimited string (the opening backtick was
// already consumed) into text and stores the STRING token.
//
// No escape sequences are processed. Two consecutive backticks stand for one
// literal backtick, a single backtick ends the string. Reaching the end of the
// input before the closing backtick is an APFL_ERR_UNEXPECTED_EOF error;
// reaching it directly after the closing backtick is fine.
static enum apfl_parse_result
inner_backtick_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
{
    struct apfl_position start = tokenizer->position;
    unsigned char byte;

    for (;;) {
        enum read_result rr = read_byte(tokenizer, &byte, true);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
            return APFL_PARSE_ERROR;
        }

        if (byte == '`') {
            // Either the end of the string or the first half of a doubled
            // backtick; the following byte decides.
            rr = read_byte(tokenizer, &byte, true);
            if (rr == RR_ERR) {
                return APFL_PARSE_ERROR;
            }
            if (rr == RR_EOF) {
                tokenizer->next_mode = NM_EOF;
                break;
            }
            if (byte != '`') {
                unread_byte(tokenizer);
                break;
            }
            // Doubled backtick: byte is '`' here and gets appended below.
        }

        if (!apfl_string_builder_append_byte(text, byte)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }
    }

    tokenizer->token = (struct apfl_token) {
        .type = APFL_TOK_STRING,
        .position = start,
        .text = apfl_string_builder_move_string(text),
    };
    return APFL_PARSE_OK;
}
// Tokenizes a backtick delimited string after its opening backtick was read.
// Owns the string builder used for the text and releases it on every path.
static enum apfl_parse_result
backtick_string(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_string_builder builder = apfl_string_builder_init(tokenizer->allocator);

    enum apfl_parse_result result = inner_backtick_string(tokenizer, &builder);

    apfl_string_builder_deinit(&builder);
    return result;
}
// Turns the (non-empty) text collected by the name scanner into the current
// token: a lone "=" is an ASSIGN token, everything else a NAME token that
// takes over the collected text. Always returns APFL_PARSE_OK.
static enum apfl_parse_result
finalize_maybe_name(
    apfl_tokenizer_ptr tokenizer,
    struct apfl_string_builder *text,
    struct apfl_position pos
) {
    assert(text->len > 0);

    bool is_lone_equals = text->len == 1 && text->bytes[0] == '=';
    if (!is_lone_equals) {
        tokenizer->token = (struct apfl_token) {
            .type = APFL_TOK_NAME,
            .position = pos,
            .text = apfl_string_builder_move_string(text),
        };
        return APFL_PARSE_OK;
    }

    tokenizer->token = (struct apfl_token) {
        .type = APFL_TOK_ASSIGN,
        .position = pos,
    };
    return APFL_PARSE_OK;
}
// Tells whether byte counts as part of a word: anything isalnum() accepts and
// every byte above 0x7F (i.e. outside of ASCII).
static bool
is_word_byte(unsigned char byte)
{
    if (isalnum(byte)) {
        return true;
    }
    return byte > 0x7F;
}
// Scans a name that starts with byte (which was already consumed) into text
// and stores the resulting token.
//
// The name ends at the end of the input, in front of any byte that starts a
// different token, in front of whitespace and in front of ASCII control
// characters. Two sequences get special treatment:
//  - '=' directly after a word byte ends the name and schedules an ASSIGN
//    token for the next apfl_tokenizer_next() call.
//  - "->" ends the name in front of the '-'. It yields the MAPSTO token
//    directly if nothing else was collected and schedules it for the next
//    call otherwise.
//
// need is forwarded to read_byte().
static enum apfl_parse_result
maybe_name_inner(
    apfl_tokenizer_ptr tokenizer,
    bool need,
    unsigned char byte,
    struct apfl_string_builder *text
) {
    struct apfl_position start = tokenizer->position;

    if (!apfl_string_builder_append_byte(text, byte)) {
        tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
        return APFL_PARSE_ERROR;
    }

    for (;;) {
        // Byte (and its position) in front of the one that is read next.
        unsigned char prev_byte = byte;
        struct apfl_position prev_pos = tokenizer->position;

        enum read_result rr = read_byte(tokenizer, &byte, need);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            return finalize_maybe_name(tokenizer, text, start);
        }

        bool ends_name;
        switch (byte) {
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        case '~':
        case '.':
        case '@':
        case ';':
        case '\n':
        case '\\':
        case ',':
        case '?':
        case '\'':
        case '#':
        case ':':
        case '"':
        case '`':
        case ' ':
        case '\r':
        case '\t':
            ends_name = true;
            break;
        default:
            // ASCII control characters are not allowed in names.
            ends_name = is_control_byte(byte);
            break;
        }

        if (ends_name) {
            // The byte belongs to whatever comes after the name.
            unread_byte(tokenizer);
            return finalize_maybe_name(tokenizer, text, start);
        }

        if (byte == '=' && is_word_byte(prev_byte)) {
            tokenizer->next_mode = NM_ASSIGN;
            return finalize_maybe_name(tokenizer, text, start);
        }

        if (byte == '>' && prev_byte == '-') {
            text->len--; // Drops the '-' that was appended last
            if (text->len == 0) {
                return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, prev_pos);
            }

            tokenizer->pos_for_mapsto = prev_pos;
            tokenizer->next_mode = NM_MAPSTO;
            return finalize_maybe_name(tokenizer, text, start);
        }

        if (!apfl_string_builder_append_byte(text, byte)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }
    }
}
// Tokenizes a name that starts with first_byte (already consumed). Owns the
// string builder used for the text and releases it on every path.
// need is forwarded to the actual scanner.
static enum apfl_parse_result
maybe_name(apfl_tokenizer_ptr tokenizer, bool need, unsigned char first_byte)
{
    struct apfl_string_builder builder = apfl_string_builder_init(tokenizer->allocator);

    enum apfl_parse_result result = maybe_name_inner(tokenizer, need, first_byte, &builder);

    apfl_string_builder_deinit(&builder);
    return result;
}
static struct apfl_token
build_number_token(double number, struct apfl_position position, bool negative)
{
if (negative) {
number *= -1;
}
return (struct apfl_token) {
.type = APFL_TOK_NUMBER,
.position = position,
.number = (apfl_number)number,
};
}
// Continues tokenizing a number after its leading '0' was read. The next byte
// may select the base: x/X for 16, o/O for 8 and b/B for 2. Any other byte is
// put back and the number is parsed as base 10. A '0' directly at the end of
// the input is the number 0.
static enum apfl_parse_result
zero(apfl_tokenizer_ptr tokenizer, struct apfl_position position, bool negative)
{
    unsigned char selector;

    enum read_result rr = read_byte(tokenizer, &selector, true);
    if (rr == RR_ERR) {
        return APFL_PARSE_ERROR;
    }
    if (rr == RR_EOF) {
        tokenizer->next_mode = NM_EOF;
        tokenizer->token = build_number_token(0, position, negative);
        return APFL_PARSE_OK;
    }

    unsigned base;
    switch (selector) {
    case 'x':
    case 'X':
        base = 16;
        break;
    case 'o':
    case 'O':
        base = 8;
        break;
    case 'b':
    case 'B':
        base = 2;
        break;
    default:
        // Not a base prefix; let the number parser look at this byte.
        unread_byte(tokenizer);
        base = 10;
        break;
    }

    return number(tokenizer, base, position, negative);
}
// Read callback handed to apfl_parse_number(): opaque is the tokenizer, the
// byte is read with need set to true.
static enum read_result
read_for_parse_number(void *opaque, unsigned char *byte)
{
    return read_byte(opaque, byte, true);
}
// Unread callback handed to apfl_parse_number(): opaque is the tokenizer.
static void
unread_for_parse_number(void *opaque)
{
    unread_byte(opaque);
}
// Parses a number in the given base from the input and stores the NUMBER
// token (positioned at pos, negated if negative is set).
//
// A word byte directly after the number is an
// APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER error; any other byte is put back for
// the next token.
static enum apfl_parse_result
number(apfl_tokenizer_ptr tokenizer, unsigned base, struct apfl_position pos, bool negative)
{
    apfl_number value;
    bool parsed = apfl_parse_number(
        base,
        read_for_parse_number,
        unread_for_parse_number,
        tokenizer,
        &value
    );
    if (!parsed) {
        return APFL_PARSE_ERROR;
    }

    unsigned char following;
    enum read_result rr = read_byte(tokenizer, &following, false);
    if (rr == RR_ERR) {
        return APFL_PARSE_ERROR;
    }

    if (rr == RR_EOF) {
        tokenizer->next_mode = NM_EOF;
    } else {
        if (is_word_byte(following)) {
            tokenizer->error = (struct apfl_error) {
                .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
                .position = tokenizer->position,
                .byte = following,
            };
            return APFL_PARSE_ERROR;
        }

        unread_byte(tokenizer);
    }

    tokenizer->token = build_number_token(value, pos, negative);
    return APFL_PARSE_OK;
}
static enum apfl_parse_result
token_source_wrap_next(void *opaque, bool need)
{
return apfl_tokenizer_next(opaque, need);
}
static struct apfl_token
token_source_wrap_get_token(void *opaque)
{
return apfl_tokenizer_get_token(opaque);
}
static struct apfl_error
token_source_wrap_get_error(void *opaque)
{
return apfl_tokenizer_get_error(opaque);
}
// Wraps the tokenizer p in a parser token source whose callbacks forward to
// apfl_tokenizer_next(), apfl_tokenizer_get_token() and
// apfl_tokenizer_get_error().
struct apfl_parser_token_source
apfl_tokenizer_as_token_source(apfl_tokenizer_ptr p)
{
    struct apfl_parser_token_source source = {
        .next = token_source_wrap_next,
        .get_token = token_source_wrap_get_token,
        .get_error = token_source_wrap_get_error,
        .opaque = p,
    };
    return source;
}