Instead of passing a flag through the parser and tokenizer to tell the input source whether we need further input, we steal a trick from Lua: in the REPL, we just keep reading lines and appending them to the input until the input loads without an "unexpected EOF" error. After all, an unexpected EOF is exactly the scenario in which we need more input. Doing things this way simplifies a bunch of places and lets us remove the ugly source_reader and iterative_runner concepts. Allowing the REPL to see the error that happened during loading required some smaller refactorings, but those were honestly for the better anyway. I also decided to get rid of the token_source concept; the parser now gets the tokenizer directly. This made things a bit simpler, too. I also want to implement string interpolation soon-ish, and for that the parser needs to do more with the tokenizer than just read the next token. One last thing: this also cleans up the web playground and makes the playground and REPL share a bunch of code. Nice!
841 lines
23 KiB
C
841 lines
23 KiB
C
#include <assert.h>
|
|
#include <ctype.h>
|
|
#include <limits.h>
|
|
#include <stdbool.h>
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "apfl.h"
|
|
|
|
#include "alloc.h"
|
|
#include "parsing.h"
|
|
|
|
// Size in bytes of the tokenizer's read buffer.
#define BUFSIZE 4096

// Type used for offsets into / lengths of the read buffer.
typedef int buf_offset;

// When refilling, read_byte() keeps one already consumed byte at the start of
// the buffer, so there must be room for at least one more byte after it.
static_assert(BUFSIZE >= 2, "BUFSIZE must be at least 2");
static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset");
|
|
|
|
struct apfl_tokenizer {
|
|
struct apfl_allocator allocator;
|
|
struct apfl_io_reader source_reader;
|
|
unsigned char *buf;
|
|
buf_offset buf_pos;
|
|
buf_offset buf_len;
|
|
|
|
enum {
|
|
NM_REGULAR,
|
|
NM_MAPSTO,
|
|
NM_ASSIGN,
|
|
NM_EOF,
|
|
} next_mode;
|
|
struct apfl_position pos_for_mapsto;
|
|
|
|
struct apfl_position position;
|
|
struct apfl_position last_position;
|
|
bool last_byte_was_linebreak;
|
|
bool prev_last_byte_was_linebreak;
|
|
|
|
union {
|
|
struct apfl_token token;
|
|
struct apfl_error error;
|
|
};
|
|
};
|
|
|
|
apfl_tokenizer_ptr
|
|
apfl_tokenizer_new(struct apfl_allocator allocator, struct apfl_io_reader source_reader)
|
|
{
|
|
apfl_tokenizer_ptr tokenizer = ALLOC_OBJ(allocator, struct apfl_tokenizer);
|
|
if (tokenizer == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
tokenizer->allocator = allocator;
|
|
tokenizer->source_reader = source_reader;
|
|
|
|
if ((tokenizer->buf = ALLOC_BYTES(allocator, BUFSIZE)) == NULL) {
|
|
FREE_OBJ(allocator, tokenizer);
|
|
return NULL;
|
|
}
|
|
|
|
tokenizer->buf_pos = 0;
|
|
tokenizer->buf_len = 0;
|
|
|
|
tokenizer->position = (struct apfl_position) {
|
|
.line = 1,
|
|
.col = 0, // The first character was not yet read
|
|
};
|
|
tokenizer->last_byte_was_linebreak = false;
|
|
tokenizer->prev_last_byte_was_linebreak = false;
|
|
|
|
tokenizer->next_mode = NM_REGULAR;
|
|
|
|
return tokenizer;
|
|
}
|
|
|
|
void
|
|
apfl_tokenizer_destroy(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
if (tokenizer == NULL) {
|
|
return;
|
|
}
|
|
|
|
FREE_BYTES(tokenizer->allocator, tokenizer->buf, BUFSIZE);
|
|
FREE_OBJ(tokenizer->allocator, tokenizer);
|
|
}
|
|
|
|
struct apfl_token
|
|
apfl_tokenizer_get_token(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
return tokenizer->token;
|
|
}
|
|
|
|
struct apfl_error
|
|
apfl_tokenizer_get_error(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
return tokenizer->error;
|
|
}
|
|
|
|
static enum read_result
|
|
read_byte(apfl_tokenizer_ptr tokenizer, unsigned char *byte)
|
|
{
|
|
if (tokenizer->buf_pos >= tokenizer->buf_len) {
|
|
size_t off = 0;
|
|
if (tokenizer->buf_len > 0) {
|
|
off = 1;
|
|
tokenizer->buf[0] = tokenizer->buf[tokenizer->buf_len - 1];
|
|
}
|
|
|
|
size_t len = BUFSIZE - off;
|
|
|
|
tokenizer->buf_pos = off;
|
|
tokenizer->buf_len = off;
|
|
|
|
if (!apfl_io_read_bytes(tokenizer->source_reader, tokenizer->buf+off, &len)) {
|
|
tokenizer->error.type = APFL_ERR_INPUT_ERROR;
|
|
return RR_ERR;
|
|
}
|
|
|
|
tokenizer->buf_len = len + off;
|
|
|
|
if (len == 0) {
|
|
return RR_EOF;
|
|
}
|
|
}
|
|
|
|
tokenizer->prev_last_byte_was_linebreak = tokenizer->last_byte_was_linebreak;
|
|
tokenizer->last_position = tokenizer->position;
|
|
|
|
if (tokenizer->last_byte_was_linebreak) {
|
|
tokenizer->position.line++;
|
|
tokenizer->position.col = 0;
|
|
}
|
|
|
|
*byte = tokenizer->buf[tokenizer->buf_pos];
|
|
tokenizer->buf_pos++;
|
|
|
|
tokenizer->last_byte_was_linebreak = (*byte == '\n');
|
|
tokenizer->position.col++;
|
|
|
|
return RR_OK;
|
|
}
|
|
|
|
// Only at most 1 unread_byte() call is allowed after a read_byte() call!
|
|
static void
|
|
unread_byte(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
tokenizer->position = tokenizer->last_position;
|
|
tokenizer->last_byte_was_linebreak = tokenizer->prev_last_byte_was_linebreak;
|
|
|
|
assert(tokenizer->buf_pos > 0);
|
|
tokenizer->buf_pos--;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
yield_simple_token(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
enum apfl_token_type type,
|
|
struct apfl_position pos
|
|
) {
|
|
tokenizer->token.type = type;
|
|
tokenizer->token.position = pos;
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
static enum apfl_parse_result comment(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result colon(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result string(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result backtick_string(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, unsigned char);
|
|
static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool);
|
|
static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool);
|
|
|
|
// Reports whether byte is an ASCII control character, i.e. anything below
// 0x20 (space) or 0x7F (DEL).
static bool
is_control_byte(unsigned char byte)
{
    if (byte == 0x7F) {
        return true;
    }

    return byte < 0x20;
}
|
|
|
|
static enum apfl_parse_result
|
|
minus(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
struct apfl_string str = apfl_string_blank();
|
|
if (!apfl_string_copy(tokenizer->allocator, &str, apfl_string_view_from("-"))) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_NAME,
|
|
.position = pos,
|
|
.text = str,
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
switch (byte) {
|
|
case '0':
|
|
return zero(tokenizer, pos, true);
|
|
case '>':
|
|
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, pos);
|
|
default:
|
|
unread_byte(tokenizer);
|
|
if (isdigit(byte)) {
|
|
return number(tokenizer, 10, pos, true);
|
|
} else {
|
|
return maybe_name(tokenizer, '-');
|
|
}
|
|
}
|
|
}
|
|
|
|
enum apfl_parse_result
|
|
apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
switch (tokenizer->next_mode) {
|
|
case NM_REGULAR:
|
|
break;
|
|
case NM_MAPSTO:
|
|
tokenizer->next_mode = NM_REGULAR;
|
|
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto);
|
|
case NM_ASSIGN:
|
|
tokenizer->next_mode = NM_REGULAR;
|
|
return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position);
|
|
case NM_EOF:
|
|
return APFL_PARSE_EOF;
|
|
}
|
|
|
|
unsigned char byte;
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
return APFL_PARSE_EOF;
|
|
}
|
|
|
|
switch (byte) {
|
|
case '(':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LPAREN, tokenizer->position);
|
|
case ')':
|
|
return yield_simple_token(tokenizer, APFL_TOK_RPAREN, tokenizer->position);
|
|
case '[':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LBRACKET, tokenizer->position);
|
|
case ']':
|
|
return yield_simple_token(tokenizer, APFL_TOK_RBRACKET, tokenizer->position);
|
|
case '{':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LBRACE, tokenizer->position);
|
|
case '}':
|
|
return yield_simple_token(tokenizer, APFL_TOK_RBRACE, tokenizer->position);
|
|
case '~':
|
|
return yield_simple_token(tokenizer, APFL_TOK_EXPAND, tokenizer->position);
|
|
case '.':
|
|
return yield_simple_token(tokenizer, APFL_TOK_DOT, tokenizer->position);
|
|
case '@':
|
|
return yield_simple_token(tokenizer, APFL_TOK_AT, tokenizer->position);
|
|
case ';':
|
|
return yield_simple_token(tokenizer, APFL_TOK_SEMICOLON, tokenizer->position);
|
|
case '\n':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LINEBREAK, tokenizer->position);
|
|
case '\\':
|
|
return yield_simple_token(tokenizer, APFL_TOK_CONTINUE_LINE, tokenizer->position);
|
|
case ',':
|
|
return yield_simple_token(tokenizer, APFL_TOK_COMMA, tokenizer->position);
|
|
case '?':
|
|
return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position);
|
|
case '\'':
|
|
return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position);
|
|
case '`':
|
|
return backtick_string(tokenizer);
|
|
case '#':
|
|
return comment(tokenizer);
|
|
case ':':
|
|
return colon(tokenizer);
|
|
case '"':
|
|
return string(tokenizer);
|
|
case '-':
|
|
return minus(tokenizer);
|
|
case ' ':
|
|
case '\r':
|
|
case '\t':
|
|
// Skip whitespace
|
|
break;
|
|
case '0':
|
|
return zero(tokenizer, tokenizer->position, false);
|
|
default:
|
|
if (is_control_byte(byte)) {
|
|
// Disallow ASCII control characters here
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_UNEXPECTED_BYTE,
|
|
.position = tokenizer->position,
|
|
.byte = byte,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
} else if (isdigit(byte)) {
|
|
struct apfl_position position = tokenizer->position;
|
|
unread_byte(tokenizer);
|
|
return number(tokenizer, 10, position, false);
|
|
} else {
|
|
return maybe_name(tokenizer, byte);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
comment(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
unsigned char byte;
|
|
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_COMMENT,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(&text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
if (byte == '\n') {
|
|
unread_byte(tokenizer);
|
|
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_COMMENT,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(&text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
if (!apfl_string_builder_append_byte(&text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
colon(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
unsigned char byte;
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
return yield_simple_token(tokenizer, APFL_TOK_COLON, pos);
|
|
}
|
|
|
|
switch (byte) {
|
|
case '=':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LOCAL_ASSIGN, pos);
|
|
case ':':
|
|
return yield_simple_token(tokenizer, APFL_TOK_DOUBLE_COLON, pos);
|
|
default:
|
|
unread_byte(tokenizer);
|
|
return yield_simple_token(tokenizer, APFL_TOK_COLON, pos);
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
append_single_byte(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
struct apfl_string_builder *text,
|
|
char byte
|
|
) {
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
hex_escape(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
struct apfl_string_builder *text
|
|
) {
|
|
unsigned char escaped_byte = 0;
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
unsigned char byte;
|
|
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
int nibble = apfl_parse_digit(byte);
|
|
if (nibble < 0 || nibble > 0xF) {
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE,
|
|
.position = tokenizer->position,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
escaped_byte <<= 4;
|
|
escaped_byte |= 0xF & nibble;
|
|
}
|
|
|
|
return append_single_byte(tokenizer, text, escaped_byte);
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
escape_sequence(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
switch (byte) {
|
|
case 'x':
|
|
case 'X':
|
|
return hex_escape(tokenizer, text);
|
|
// case 'u':
|
|
// case 'U':
|
|
// return unicode_escape(tokenizer, pos, text);
|
|
case '\\':
|
|
return append_single_byte(tokenizer, text, '\\');
|
|
case 'n':
|
|
return append_single_byte(tokenizer, text, '\n');
|
|
case 'r':
|
|
return append_single_byte(tokenizer, text, '\r');
|
|
case 't':
|
|
return append_single_byte(tokenizer, text, '\t');
|
|
case '"':
|
|
return append_single_byte(tokenizer, text, '"');
|
|
case '0':
|
|
return append_single_byte(tokenizer, text, 0);
|
|
default:
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_INVALID_ESCAPE_SEQUENCE,
|
|
.position = pos,
|
|
.byte = byte,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
inner_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
|
|
enum apfl_parse_result subresult;
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
switch (byte) {
|
|
case '"':
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_STRING,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
case '\\':
|
|
if ((subresult = escape_sequence(tokenizer, text)) != APFL_PARSE_OK) {
|
|
return subresult;
|
|
}
|
|
break;
|
|
default:
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
string(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
enum apfl_parse_result out = inner_string(tokenizer, &text);
|
|
|
|
apfl_string_builder_deinit(&text);
|
|
|
|
return out;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
inner_backtick_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
if (byte != '`') {
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
goto finalize;
|
|
}
|
|
|
|
if (byte == '`') {
|
|
if (!apfl_string_builder_append_byte(text, '`')) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
unread_byte(tokenizer);
|
|
|
|
goto finalize;
|
|
}
|
|
|
|
finalize:
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_STRING,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
backtick_string(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
enum apfl_parse_result out = inner_backtick_string(tokenizer, &text);
|
|
|
|
apfl_string_builder_deinit(&text);
|
|
|
|
return out;
|
|
}
|
|
|
|
|
|
static enum apfl_parse_result
|
|
finalize_maybe_name(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
struct apfl_string_builder *text,
|
|
struct apfl_position pos
|
|
) {
|
|
assert(text->len > 0);
|
|
|
|
if (text->len == 1 && text->bytes[0] == '=') {
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_ASSIGN,
|
|
.position = pos,
|
|
};
|
|
} else {
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_NAME,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(text),
|
|
};
|
|
}
|
|
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
// Reports whether byte counts as part of a word: ASCII letters and digits
// (as classified by isalnum()) plus every byte outside the ASCII range.
static bool
is_word_byte(unsigned char byte)
{
    if (isalnum(byte)) {
        return true;
    }

    return byte > 0x7F;
}
|
|
|
|
static enum apfl_parse_result
|
|
maybe_name_inner(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
unsigned char byte,
|
|
struct apfl_string_builder *text
|
|
) {
|
|
struct apfl_position pos = tokenizer->position;
|
|
struct apfl_position last_pos;
|
|
unsigned char last_byte;
|
|
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
for (;;) {
|
|
last_byte = byte;
|
|
last_pos = tokenizer->position;
|
|
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
switch (byte) {
|
|
case '(':
|
|
case ')':
|
|
case '[':
|
|
case ']':
|
|
case '{':
|
|
case '}':
|
|
case '~':
|
|
case '.':
|
|
case '@':
|
|
case ';':
|
|
case '\n':
|
|
case '\\':
|
|
case ',':
|
|
case '?':
|
|
case '\'':
|
|
case '#':
|
|
case ':':
|
|
case '"':
|
|
case '`':
|
|
case ' ':
|
|
case '\r':
|
|
case '\t':
|
|
unread_byte(tokenizer);
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
case '=':
|
|
if (is_word_byte(last_byte)) {
|
|
tokenizer->next_mode = NM_ASSIGN;
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
break;
|
|
case '>':
|
|
if (last_byte == '-') {
|
|
text->len--; // This removes the '-' from the end of text
|
|
if (text->len == 0) {
|
|
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, last_pos);
|
|
}
|
|
|
|
tokenizer->next_mode = NM_MAPSTO;
|
|
tokenizer->pos_for_mapsto = last_pos;
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
break;
|
|
default:
|
|
if (is_control_byte(byte)) {
|
|
// Disallow ASCII control characters in names
|
|
unread_byte(tokenizer);
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
maybe_name(apfl_tokenizer_ptr tokenizer, unsigned char first_byte)
|
|
{
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
enum apfl_parse_result out = maybe_name_inner(tokenizer, first_byte, &text);
|
|
|
|
apfl_string_builder_deinit(&text);
|
|
|
|
return out;
|
|
}
|
|
|
|
static struct apfl_token
|
|
build_number_token(double number, struct apfl_position position, bool negative)
|
|
{
|
|
if (negative) {
|
|
number *= -1;
|
|
}
|
|
|
|
return (struct apfl_token) {
|
|
.type = APFL_TOK_NUMBER,
|
|
.position = position,
|
|
.number = (apfl_number)number,
|
|
};
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
zero(apfl_tokenizer_ptr tokenizer, struct apfl_position position, bool negative)
|
|
{
|
|
unsigned char byte;
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->token = build_number_token(0, position, negative);
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
switch (byte) {
|
|
case 'x':
|
|
case 'X':
|
|
return number(tokenizer, 16, position, negative);
|
|
case 'o':
|
|
case 'O':
|
|
return number(tokenizer, 8, position, negative);
|
|
case 'b':
|
|
case 'B':
|
|
return number(tokenizer, 2, position, negative);
|
|
default:
|
|
unread_byte(tokenizer);
|
|
return number(tokenizer, 10, position, negative);
|
|
}
|
|
}
|
|
|
|
static enum read_result
|
|
read_for_parse_number(void *opaque, unsigned char *byte)
|
|
{
|
|
apfl_tokenizer_ptr tokenizer = opaque;
|
|
return read_byte(tokenizer, byte);
|
|
}
|
|
|
|
static void
|
|
unread_for_parse_number(void *opaque)
|
|
{
|
|
apfl_tokenizer_ptr tokenizer = opaque;
|
|
unread_byte(tokenizer);
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
number(apfl_tokenizer_ptr tokenizer, unsigned base, struct apfl_position pos, bool negative)
|
|
{
|
|
apfl_number num;
|
|
if (!apfl_parse_number(
|
|
base,
|
|
read_for_parse_number,
|
|
unread_for_parse_number,
|
|
tokenizer,
|
|
&num
|
|
)) {
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
unsigned char byte;
|
|
switch (read_byte(tokenizer, &byte)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->token = build_number_token(num, pos, negative);
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
if (is_word_byte(byte)) {
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
|
|
.position = tokenizer->position,
|
|
.byte = byte,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
unread_byte(tokenizer);
|
|
tokenizer->token = build_number_token(num, pos, negative);
|
|
return APFL_PARSE_OK;
|
|
}
|