This is useful when writing strings that contain a lot of backslashes, e.g. in regular expressions.
872 lines
24 KiB
C
872 lines
24 KiB
C
#include <assert.h>
|
|
#include <ctype.h>
|
|
#include <limits.h>
|
|
#include <stdbool.h>
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "apfl.h"
|
|
|
|
#include "alloc.h"
|
|
#include "parsing.h"
|
|
|
|
/* Size in bytes of the tokenizer's read buffer. */
#define BUFSIZE 4096

/* Integer type used for offsets into, and lengths of, the read buffer. */
typedef int buf_offset;

/*
 * A refill keeps the previously read byte in slot 0, so the buffer needs
 * room for that byte plus at least one fresh byte.
 */
static_assert(BUFSIZE >= 2, "BUFSIZE must be at least 2");
static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset");
|
|
|
|
struct apfl_tokenizer {
|
|
struct apfl_allocator allocator;
|
|
struct apfl_source_reader source_reader;
|
|
unsigned char *buf;
|
|
buf_offset buf_pos;
|
|
buf_offset buf_len;
|
|
|
|
enum {
|
|
NM_REGULAR,
|
|
NM_MAPSTO,
|
|
NM_ASSIGN,
|
|
NM_EOF,
|
|
} next_mode;
|
|
struct apfl_position pos_for_mapsto;
|
|
|
|
struct apfl_position position;
|
|
struct apfl_position last_position;
|
|
bool last_byte_was_linebreak;
|
|
bool prev_last_byte_was_linebreak;
|
|
|
|
union {
|
|
struct apfl_token token;
|
|
struct apfl_error error;
|
|
};
|
|
};
|
|
|
|
apfl_tokenizer_ptr
|
|
apfl_tokenizer_new(struct apfl_allocator allocator, struct apfl_source_reader source_reader)
|
|
{
|
|
apfl_tokenizer_ptr tokenizer = ALLOC_OBJ(allocator, struct apfl_tokenizer);
|
|
if (tokenizer == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
tokenizer->allocator = allocator;
|
|
tokenizer->source_reader = source_reader;
|
|
|
|
if ((tokenizer->buf = ALLOC_BYTES(allocator, BUFSIZE)) == NULL) {
|
|
FREE_OBJ(allocator, tokenizer);
|
|
return NULL;
|
|
}
|
|
|
|
tokenizer->buf_pos = 0;
|
|
tokenizer->buf_len = 0;
|
|
|
|
tokenizer->position = (struct apfl_position) {
|
|
.line = 1,
|
|
.col = 0, // The first character was not yet read
|
|
};
|
|
tokenizer->last_byte_was_linebreak = false;
|
|
tokenizer->prev_last_byte_was_linebreak = false;
|
|
|
|
tokenizer->next_mode = NM_REGULAR;
|
|
|
|
return tokenizer;
|
|
}
|
|
|
|
void
|
|
apfl_tokenizer_destroy(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
if (tokenizer == NULL) {
|
|
return;
|
|
}
|
|
|
|
FREE_BYTES(tokenizer->allocator, tokenizer->buf, BUFSIZE);
|
|
FREE_OBJ(tokenizer->allocator, tokenizer);
|
|
}
|
|
|
|
struct apfl_token
|
|
apfl_tokenizer_get_token(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
return tokenizer->token;
|
|
}
|
|
|
|
struct apfl_error
|
|
apfl_tokenizer_get_error(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
return tokenizer->error;
|
|
}
|
|
|
|
static enum read_result
|
|
read_byte(apfl_tokenizer_ptr tokenizer, unsigned char *byte, bool need)
|
|
{
|
|
if (tokenizer->buf_pos >= tokenizer->buf_len) {
|
|
size_t off = 0;
|
|
if (tokenizer->buf_len > 0) {
|
|
off = 1;
|
|
tokenizer->buf[0] = tokenizer->buf[tokenizer->buf_len - 1];
|
|
}
|
|
|
|
size_t len = BUFSIZE - off;
|
|
|
|
tokenizer->buf_pos = off;
|
|
tokenizer->buf_len = off;
|
|
|
|
if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf+off, &len, need)) {
|
|
tokenizer->error.type = APFL_ERR_INPUT_ERROR;
|
|
return RR_ERR;
|
|
}
|
|
|
|
tokenizer->buf_len = len + off;
|
|
|
|
if (len == 0) {
|
|
return RR_EOF;
|
|
}
|
|
}
|
|
|
|
tokenizer->prev_last_byte_was_linebreak = tokenizer->last_byte_was_linebreak;
|
|
tokenizer->last_position = tokenizer->position;
|
|
|
|
if (tokenizer->last_byte_was_linebreak) {
|
|
tokenizer->position.line++;
|
|
tokenizer->position.col = 0;
|
|
}
|
|
|
|
*byte = tokenizer->buf[tokenizer->buf_pos];
|
|
tokenizer->buf_pos++;
|
|
|
|
tokenizer->last_byte_was_linebreak = (*byte == '\n');
|
|
tokenizer->position.col++;
|
|
|
|
return RR_OK;
|
|
}
|
|
|
|
// Only at most 1 unread_byte() call is allowed after a read_byte() call!
|
|
static void
|
|
unread_byte(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
tokenizer->position = tokenizer->last_position;
|
|
tokenizer->last_byte_was_linebreak = tokenizer->prev_last_byte_was_linebreak;
|
|
|
|
assert(tokenizer->buf_pos > 0);
|
|
tokenizer->buf_pos--;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
yield_simple_token(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
enum apfl_token_type type,
|
|
struct apfl_position pos
|
|
) {
|
|
tokenizer->token.type = type;
|
|
tokenizer->token.position = pos;
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
static enum apfl_parse_result comment(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result colon(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result string(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result backtick_string(apfl_tokenizer_ptr);
|
|
static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, unsigned char);
|
|
static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool);
|
|
static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool);
|
|
|
|
/*
 * True for ASCII control characters: everything below 0x20 (space) and
 * 0x7F (DEL).
 */
static bool
is_control_byte(unsigned char byte)
{
    if (byte == 0x7F) {
        return true;
    }
    return byte < 0x20;
}
|
|
|
|
static enum apfl_parse_result
|
|
minus(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
struct apfl_string str = apfl_string_blank();
|
|
if (!apfl_string_copy(tokenizer->allocator, &str, apfl_string_view_from("-"))) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_NAME,
|
|
.position = pos,
|
|
.text = str,
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
switch (byte) {
|
|
case '0':
|
|
return zero(tokenizer, pos, true);
|
|
case '>':
|
|
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, pos);
|
|
default:
|
|
unread_byte(tokenizer);
|
|
if (isdigit(byte)) {
|
|
return number(tokenizer, 10, pos, true);
|
|
} else {
|
|
return maybe_name(tokenizer, true, '-');
|
|
}
|
|
}
|
|
}
|
|
|
|
enum apfl_parse_result
|
|
apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
|
|
{
|
|
switch (tokenizer->next_mode) {
|
|
case NM_REGULAR:
|
|
break;
|
|
case NM_MAPSTO:
|
|
tokenizer->next_mode = NM_REGULAR;
|
|
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto);
|
|
case NM_ASSIGN:
|
|
tokenizer->next_mode = NM_REGULAR;
|
|
return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position);
|
|
case NM_EOF:
|
|
return APFL_PARSE_EOF;
|
|
}
|
|
|
|
unsigned char byte;
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte, need)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
return APFL_PARSE_EOF;
|
|
}
|
|
|
|
switch (byte) {
|
|
case '(':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LPAREN, tokenizer->position);
|
|
case ')':
|
|
return yield_simple_token(tokenizer, APFL_TOK_RPAREN, tokenizer->position);
|
|
case '[':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LBRACKET, tokenizer->position);
|
|
case ']':
|
|
return yield_simple_token(tokenizer, APFL_TOK_RBRACKET, tokenizer->position);
|
|
case '{':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LBRACE, tokenizer->position);
|
|
case '}':
|
|
return yield_simple_token(tokenizer, APFL_TOK_RBRACE, tokenizer->position);
|
|
case '~':
|
|
return yield_simple_token(tokenizer, APFL_TOK_EXPAND, tokenizer->position);
|
|
case '.':
|
|
return yield_simple_token(tokenizer, APFL_TOK_DOT, tokenizer->position);
|
|
case '@':
|
|
return yield_simple_token(tokenizer, APFL_TOK_AT, tokenizer->position);
|
|
case ';':
|
|
return yield_simple_token(tokenizer, APFL_TOK_SEMICOLON, tokenizer->position);
|
|
case '\n':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LINEBREAK, tokenizer->position);
|
|
case '\\':
|
|
return yield_simple_token(tokenizer, APFL_TOK_CONTINUE_LINE, tokenizer->position);
|
|
case ',':
|
|
return yield_simple_token(tokenizer, APFL_TOK_COMMA, tokenizer->position);
|
|
case '?':
|
|
return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position);
|
|
case '\'':
|
|
return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position);
|
|
case '`':
|
|
return backtick_string(tokenizer);
|
|
case '#':
|
|
return comment(tokenizer);
|
|
case ':':
|
|
return colon(tokenizer);
|
|
case '"':
|
|
return string(tokenizer);
|
|
case '-':
|
|
return minus(tokenizer);
|
|
case ' ':
|
|
case '\r':
|
|
case '\t':
|
|
// Skip whitespace
|
|
break;
|
|
case '0':
|
|
return zero(tokenizer, tokenizer->position, false);
|
|
default:
|
|
if (is_control_byte(byte)) {
|
|
// Disallow ASCII control characters here
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_UNEXPECTED_BYTE,
|
|
.position = tokenizer->position,
|
|
.byte = byte,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
} else if (isdigit(byte)) {
|
|
struct apfl_position position = tokenizer->position;
|
|
unread_byte(tokenizer);
|
|
return number(tokenizer, 10, position, false);
|
|
} else {
|
|
return maybe_name(tokenizer, need, byte);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
comment(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
unsigned char byte;
|
|
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_COMMENT,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(&text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
if (byte == '\n') {
|
|
unread_byte(tokenizer);
|
|
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_COMMENT,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(&text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
if (!apfl_string_builder_append_byte(&text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
colon(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
unsigned char byte;
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
return yield_simple_token(tokenizer, APFL_TOK_COLON, pos);
|
|
}
|
|
|
|
switch (byte) {
|
|
case '=':
|
|
return yield_simple_token(tokenizer, APFL_TOK_LOCAL_ASSIGN, pos);
|
|
case ':':
|
|
return yield_simple_token(tokenizer, APFL_TOK_DOUBLE_COLON, pos);
|
|
default:
|
|
unread_byte(tokenizer);
|
|
return yield_simple_token(tokenizer, APFL_TOK_COLON, pos);
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
append_single_byte(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
struct apfl_string_builder *text,
|
|
char byte
|
|
) {
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
hex_escape(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
struct apfl_string_builder *text
|
|
) {
|
|
unsigned char escaped_byte = 0;
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
unsigned char byte;
|
|
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
int nibble = apfl_parse_digit(byte);
|
|
if (nibble < 0 || nibble > 0xF) {
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE,
|
|
.position = tokenizer->position,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
escaped_byte <<= 4;
|
|
escaped_byte |= 0xF & nibble;
|
|
}
|
|
|
|
return append_single_byte(tokenizer, text, escaped_byte);
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
escape_sequence(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
switch (byte) {
|
|
case 'x':
|
|
case 'X':
|
|
return hex_escape(tokenizer, text);
|
|
// case 'u':
|
|
// case 'U':
|
|
// return unicode_escape(tokenizer, pos, text);
|
|
case '\\':
|
|
return append_single_byte(tokenizer, text, '\\');
|
|
case 'n':
|
|
return append_single_byte(tokenizer, text, '\n');
|
|
case 'r':
|
|
return append_single_byte(tokenizer, text, '\r');
|
|
case 't':
|
|
return append_single_byte(tokenizer, text, '\t');
|
|
case '"':
|
|
return append_single_byte(tokenizer, text, '"');
|
|
case '0':
|
|
return append_single_byte(tokenizer, text, 0);
|
|
default:
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_INVALID_ESCAPE_SEQUENCE,
|
|
.position = pos,
|
|
.byte = byte,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
inner_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
|
|
enum apfl_parse_result subresult;
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
switch (byte) {
|
|
case '"':
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_STRING,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
case '\\':
|
|
if ((subresult = escape_sequence(tokenizer, text)) != APFL_PARSE_OK) {
|
|
return subresult;
|
|
}
|
|
break;
|
|
default:
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
string(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
enum apfl_parse_result out = inner_string(tokenizer, &text);
|
|
|
|
apfl_string_builder_deinit(&text);
|
|
|
|
return out;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
inner_backtick_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
|
|
{
|
|
struct apfl_position pos = tokenizer->position;
|
|
|
|
unsigned char byte;
|
|
|
|
for (;;) {
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
if (byte != '`') {
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
goto finalize;
|
|
}
|
|
|
|
if (byte == '`') {
|
|
if (!apfl_string_builder_append_byte(text, '`')) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
unread_byte(tokenizer);
|
|
|
|
goto finalize;
|
|
}
|
|
|
|
finalize:
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_STRING,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(text),
|
|
};
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
backtick_string(apfl_tokenizer_ptr tokenizer)
|
|
{
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
enum apfl_parse_result out = inner_backtick_string(tokenizer, &text);
|
|
|
|
apfl_string_builder_deinit(&text);
|
|
|
|
return out;
|
|
}
|
|
|
|
|
|
static enum apfl_parse_result
|
|
finalize_maybe_name(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
struct apfl_string_builder *text,
|
|
struct apfl_position pos
|
|
) {
|
|
assert(text->len > 0);
|
|
|
|
if (text->len == 1 && text->bytes[0] == '=') {
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_ASSIGN,
|
|
.position = pos,
|
|
};
|
|
} else {
|
|
tokenizer->token = (struct apfl_token) {
|
|
.type = APFL_TOK_NAME,
|
|
.position = pos,
|
|
.text = apfl_string_builder_move_string(text),
|
|
};
|
|
}
|
|
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
/*
 * True for bytes that count as "word" characters: alphanumerics according
 * to isalnum() and every byte above 0x7F (i.e. outside ASCII).
 */
static bool
is_word_byte(unsigned char byte)
{
    if (isalnum(byte)) {
        return true;
    }
    return byte > 0x7F;
}
|
|
|
|
static enum apfl_parse_result
|
|
maybe_name_inner(
|
|
apfl_tokenizer_ptr tokenizer,
|
|
bool need,
|
|
unsigned char byte,
|
|
struct apfl_string_builder *text
|
|
) {
|
|
struct apfl_position pos = tokenizer->position;
|
|
struct apfl_position last_pos;
|
|
unsigned char last_byte;
|
|
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
for (;;) {
|
|
last_byte = byte;
|
|
last_pos = tokenizer->position;
|
|
|
|
switch (read_byte(tokenizer, &byte, need)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
switch (byte) {
|
|
case '(':
|
|
case ')':
|
|
case '[':
|
|
case ']':
|
|
case '{':
|
|
case '}':
|
|
case '~':
|
|
case '.':
|
|
case '@':
|
|
case ';':
|
|
case '\n':
|
|
case '\\':
|
|
case ',':
|
|
case '?':
|
|
case '\'':
|
|
case '#':
|
|
case ':':
|
|
case '"':
|
|
case '`':
|
|
case ' ':
|
|
case '\r':
|
|
case '\t':
|
|
unread_byte(tokenizer);
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
case '=':
|
|
if (is_word_byte(last_byte)) {
|
|
tokenizer->next_mode = NM_ASSIGN;
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
break;
|
|
case '>':
|
|
if (last_byte == '-') {
|
|
text->len--; // This removes the '-' from the end of text
|
|
if (text->len == 0) {
|
|
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, last_pos);
|
|
}
|
|
|
|
tokenizer->next_mode = NM_MAPSTO;
|
|
tokenizer->pos_for_mapsto = last_pos;
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
break;
|
|
default:
|
|
if (is_control_byte(byte)) {
|
|
// Disallow ASCII control characters in names
|
|
unread_byte(tokenizer);
|
|
return finalize_maybe_name(tokenizer, text, pos);
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
if (!apfl_string_builder_append_byte(text, byte)) {
|
|
tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
}
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
maybe_name(apfl_tokenizer_ptr tokenizer, bool need, unsigned char first_byte)
|
|
{
|
|
struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator);
|
|
|
|
enum apfl_parse_result out = maybe_name_inner(tokenizer, need, first_byte, &text);
|
|
|
|
apfl_string_builder_deinit(&text);
|
|
|
|
return out;
|
|
}
|
|
|
|
static struct apfl_token
|
|
build_number_token(double number, struct apfl_position position, bool negative)
|
|
{
|
|
if (negative) {
|
|
number *= -1;
|
|
}
|
|
|
|
return (struct apfl_token) {
|
|
.type = APFL_TOK_NUMBER,
|
|
.position = position,
|
|
.number = (apfl_number)number,
|
|
};
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
zero(apfl_tokenizer_ptr tokenizer, struct apfl_position position, bool negative)
|
|
{
|
|
unsigned char byte;
|
|
switch (read_byte(tokenizer, &byte, true)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->token = build_number_token(0, position, negative);
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
switch (byte) {
|
|
case 'x':
|
|
case 'X':
|
|
return number(tokenizer, 16, position, negative);
|
|
case 'o':
|
|
case 'O':
|
|
return number(tokenizer, 8, position, negative);
|
|
case 'b':
|
|
case 'B':
|
|
return number(tokenizer, 2, position, negative);
|
|
default:
|
|
unread_byte(tokenizer);
|
|
return number(tokenizer, 10, position, negative);
|
|
}
|
|
}
|
|
|
|
static enum read_result
|
|
read_for_parse_number(void *opaque, unsigned char *byte)
|
|
{
|
|
apfl_tokenizer_ptr tokenizer = opaque;
|
|
return read_byte(tokenizer, byte, true);
|
|
}
|
|
|
|
static void
|
|
unread_for_parse_number(void *opaque)
|
|
{
|
|
apfl_tokenizer_ptr tokenizer = opaque;
|
|
unread_byte(tokenizer);
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
number(apfl_tokenizer_ptr tokenizer, unsigned base, struct apfl_position pos, bool negative)
|
|
{
|
|
apfl_number num;
|
|
if (!apfl_parse_number(
|
|
base,
|
|
read_for_parse_number,
|
|
unread_for_parse_number,
|
|
tokenizer,
|
|
&num
|
|
)) {
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
unsigned char byte;
|
|
switch (read_byte(tokenizer, &byte, false)) {
|
|
case RR_OK:
|
|
break;
|
|
case RR_ERR:
|
|
return APFL_PARSE_ERROR;
|
|
case RR_EOF:
|
|
tokenizer->next_mode = NM_EOF;
|
|
tokenizer->token = build_number_token(num, pos, negative);
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
if (is_word_byte(byte)) {
|
|
tokenizer->error = (struct apfl_error) {
|
|
.type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
|
|
.position = tokenizer->position,
|
|
.byte = byte,
|
|
};
|
|
return APFL_PARSE_ERROR;
|
|
}
|
|
|
|
unread_byte(tokenizer);
|
|
tokenizer->token = build_number_token(num, pos, negative);
|
|
return APFL_PARSE_OK;
|
|
}
|
|
|
|
static enum apfl_parse_result
|
|
token_source_wrap_next(void *opaque, bool need)
|
|
{
|
|
return apfl_tokenizer_next(opaque, need);
|
|
}
|
|
|
|
static struct apfl_token
|
|
token_source_wrap_get_token(void *opaque)
|
|
{
|
|
return apfl_tokenizer_get_token(opaque);
|
|
}
|
|
|
|
static struct apfl_error
|
|
token_source_wrap_get_error(void *opaque)
|
|
{
|
|
return apfl_tokenizer_get_error(opaque);
|
|
}
|
|
|
|
|
|
struct apfl_parser_token_source
|
|
apfl_tokenizer_as_token_source(apfl_tokenizer_ptr p)
|
|
{
|
|
return (struct apfl_parser_token_source) {
|
|
.next = token_source_wrap_next,
|
|
.get_token = token_source_wrap_get_token,
|
|
.get_error = token_source_wrap_get_error,
|
|
.opaque = p,
|
|
};
|
|
}
|