apfl/src/tokenizer.c

910 lines
23 KiB
C
Raw Normal View History

2021-12-10 20:22:16 +00:00
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include "apfl.h"
// Size in bytes of the read buffer that is allocated per tokenizer and offered
// to the source reader callback as the maximum amount to read.
#define BUFSIZE 4096
// Integer type used for the read cursor and fill level of the read buffer.
typedef int buf_offset;
// Compile-time guard: BUFSIZE must be representable in buf_offset (an int).
static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset");
// Complete state of one tokenizer: the input source, the read buffer, the
// position bookkeeping and the result (token or error) of the latest step.
struct apfl_tokenizer {
// Callback used to refill buf, and the context pointer handed to it.
apfl_source_reader_cb source_reader;
void *source_reader_context;
// Heap-allocated read buffer of BUFSIZE bytes.
char *buf;
// Index of the next byte in buf that read_byte() will hand out.
buf_offset buf_pos;
// Number of valid bytes currently stored in buf.
buf_offset buf_len;
// What apfl_tokenizer_next() has to do on its next call:
//   NM_REGULAR          read bytes and tokenize them
//   NM_NEGATIVE_NUMBER  tokenize a negative number whose first digit was already read
//   NM_MAPSTO           emit the APFL_TOK_MAPSTO token that was recognised after a name
//   NM_ASSIGN           emit the APFL_TOK_ASSIGN token that was recognised after a name
//   NM_EOF              the input is exhausted, keep reporting APFL_PARSE_EOF
enum {
NM_REGULAR,
NM_NEGATIVE_NUMBER,
NM_MAPSTO,
NM_ASSIGN,
NM_EOF,
} next_mode;
// Position to report for the pending token in NM_MAPSTO mode.
struct apfl_position pos_for_mapsto;
// The already consumed first digit for NM_NEGATIVE_NUMBER mode.
char first_digit_for_negative_number;
// Position of the byte most recently returned by read_byte().
struct apfl_position position;
// Set when the last byte read was '\n'; the line counter is advanced lazily
// when the following byte is read.
bool last_byte_was_linebreak;
// Result of the latest tokenizer step. Token and error share storage, so only
// the member matching the last return value is meaningful.
union {
struct apfl_token token;
struct apfl_error error;
};
};
// Allocate and initialise a tokenizer that pulls its input from
// `source_reader`, which is called with `context` as its first argument.
//
// Returns NULL if either the tokenizer itself or its read buffer could not be
// allocated. The result has to be released with apfl_tokenizer_destroy().
apfl_tokenizer_ptr
apfl_tokenizer_new(apfl_source_reader_cb source_reader, void *context)
{
    apfl_tokenizer_ptr tokenizer = malloc(sizeof(*tokenizer));
    if (tokenizer == NULL) {
        return NULL;
    }

    char *buffer = malloc(BUFSIZE);
    if (buffer == NULL) {
        free(tokenizer);
        return NULL;
    }

    tokenizer->source_reader = source_reader;
    tokenizer->source_reader_context = context;

    // Start with an empty buffer; the first read_byte() call will fill it.
    tokenizer->buf = buffer;
    tokenizer->buf_pos = 0;
    tokenizer->buf_len = 0;

    // Column 0 of line 1: no character has been read yet.
    tokenizer->position = (struct apfl_position) {
        .line = 1,
        .col = 0,
    };
    tokenizer->last_byte_was_linebreak = false;

    tokenizer->next_mode = NM_REGULAR;

    return tokenizer;
}
// Release a tokenizer created by apfl_tokenizer_new() together with its read
// buffer. Passing NULL is allowed and does nothing.
void
apfl_tokenizer_destroy(apfl_tokenizer_ptr tokenizer)
{
    if (tokenizer != NULL) {
        free(tokenizer->buf);
        free(tokenizer);
    }
}
// Return a copy of the token slot of the tokenizer. Token and error share
// storage, so the value is only meaningful after a step that produced a token.
struct apfl_token
apfl_tokenizer_get_token(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_token current = tokenizer->token;
    return current;
}
// Return a copy of the error slot of the tokenizer. Token and error share
// storage, so the value is only meaningful after a step that reported an error.
struct apfl_error
apfl_tokenizer_get_error(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_error current = tokenizer->error;
    return current;
}
// Outcome of a single read_byte() call.
enum read_result {
// A byte was read and stored in the output parameter.
RR_OK,
// The source reader callback failed; the tokenizer error has been set.
RR_ERR,
// The source reader callback delivered no more data.
RR_EOF,
};
// Read the next input byte into `*byte`, refilling the buffer through the
// source reader callback when it is exhausted, and update the tokenizer
// position so that it describes the byte just read.
//
// `need` is forwarded unchanged to the source reader callback.
//
// Returns RR_OK when a byte was stored, RR_EOF when the reader delivered zero
// bytes and RR_ERR when the reader failed (tokenizer->error is set then).
static enum read_result
read_byte(apfl_tokenizer_ptr tokenizer, char *byte, bool need)
{
    if (tokenizer->buf_pos >= tokenizer->buf_len) {
        // Buffer exhausted: ask the reader for up to BUFSIZE fresh bytes.
        size_t len = BUFSIZE;

        tokenizer->buf_pos = 0;
        tokenizer->buf_len = 0;

        if (!tokenizer->source_reader(tokenizer->source_reader_context, tokenizer->buf, &len, need)) {
            // Assign the whole error object instead of only its type: the
            // error shares storage with the token, so the remaining fields
            // would otherwise hold leftover bytes of the previous token.
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_INPUT_ERROR };
            return RR_ERR;
        }

        tokenizer->buf_len = (buf_offset)len;

        if (len == 0) {
            return RR_EOF;
        }
    }

    // The line counter is advanced lazily: a '\n' still belongs to the line it
    // terminates, only the byte after it starts the next line.
    if (tokenizer->last_byte_was_linebreak) {
        tokenizer->position.line++;
        tokenizer->position.col = 0;
    }

    *byte = tokenizer->buf[tokenizer->buf_pos];
    tokenizer->buf_pos++;

    tokenizer->last_byte_was_linebreak = (*byte == '\n');
    tokenizer->position.col++;

    return RR_OK;
}
// Push the byte returned by the latest read_byte() call back into the buffer
// and restore `pos` as the current position (the caller passes the position it
// saved before that read).
//
// This may be called at most once after each read_byte() call.
static void
unread_byte(apfl_tokenizer_ptr tokenizer, struct apfl_position pos)
{
    tokenizer->last_byte_was_linebreak = false;
    tokenizer->buf_pos -= 1;
    tokenizer->position = pos;
}
// Store a token that consists of nothing but a type and a position in the
// tokenizer and report success. Other fields of the token slot are not touched.
static enum apfl_parse_result
yield_simple_token(
    apfl_tokenizer_ptr tokenizer,
    enum apfl_token_type type,
    struct apfl_position pos
) {
    struct apfl_token *slot = &tokenizer->token;

    slot->position = pos;
    slot->type = type;

    return APFL_PARSE_OK;
}
// Forward declarations of the sub-tokenizers that are called before their
// definitions further down in this file.
static enum apfl_parse_result comment(apfl_tokenizer_ptr);
static enum apfl_parse_result colon(apfl_tokenizer_ptr);
static enum apfl_parse_result string(apfl_tokenizer_ptr);
static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, char);
static enum apfl_parse_result number(apfl_tokenizer_ptr, bool, struct apfl_position, char, bool);
// Advance the tokenizer by one token.
//
// `need` is forwarded to the source reader callback for the reads done here
// and by the number / name sub-tokenizers.
//
// Returns APFL_PARSE_OK when a token was stored (fetch it with
// apfl_tokenizer_get_token()), APFL_PARSE_ERROR when an error was stored
// (fetch it with apfl_tokenizer_get_error()) and APFL_PARSE_EOF when the
// input is exhausted.
enum apfl_parse_result
apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
{
    // A previous step may have recognised more than it could return at once
    // (for example a name directly followed by '='). The pending part is
    // delivered now, before any further input is read.
    switch (tokenizer->next_mode) {
    case NM_REGULAR:
        break;
    case NM_MAPSTO:
        tokenizer->next_mode = NM_REGULAR;
        return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto);
    case NM_NEGATIVE_NUMBER:
        tokenizer->next_mode = NM_REGULAR;
        return number(tokenizer, need, tokenizer->position, tokenizer->first_digit_for_negative_number, true);
    case NM_ASSIGN:
        tokenizer->next_mode = NM_REGULAR;
        return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position);
    case NM_EOF:
        return APFL_PARSE_EOF;
    }

    char byte;

    for (;;) {
        switch (read_byte(tokenizer, &byte, need)) {
        case RR_OK:
            break;
        case RR_ERR:
            return APFL_PARSE_ERROR;
        case RR_EOF:
            tokenizer->next_mode = NM_EOF;
            return APFL_PARSE_EOF;
        }

        switch (byte) {
        // Single byte tokens.
        case '(':
            return yield_simple_token(tokenizer, APFL_TOK_LPAREN, tokenizer->position);
        case ')':
            return yield_simple_token(tokenizer, APFL_TOK_RPAREN, tokenizer->position);
        case '[':
            return yield_simple_token(tokenizer, APFL_TOK_LBRACKET, tokenizer->position);
        case ']':
            return yield_simple_token(tokenizer, APFL_TOK_RBRACKET, tokenizer->position);
        case '{':
            return yield_simple_token(tokenizer, APFL_TOK_LBRACE, tokenizer->position);
        case '}':
            return yield_simple_token(tokenizer, APFL_TOK_RBRACE, tokenizer->position);
        case '~':
            return yield_simple_token(tokenizer, APFL_TOK_EXPAND, tokenizer->position);
        case '.':
            return yield_simple_token(tokenizer, APFL_TOK_DOT, tokenizer->position);
        case '@':
            return yield_simple_token(tokenizer, APFL_TOK_AT, tokenizer->position);
        case ';':
            return yield_simple_token(tokenizer, APFL_TOK_SEMICOLON, tokenizer->position);
        case '\n':
            return yield_simple_token(tokenizer, APFL_TOK_LINEBREAK, tokenizer->position);
        case '\\':
            return yield_simple_token(tokenizer, APFL_TOK_CONTINUE_LINE, tokenizer->position);
        case ',':
            return yield_simple_token(tokenizer, APFL_TOK_COMMA, tokenizer->position);
        case '?':
            return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position);
        case '\'':
            return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position);

        // Tokens that need a dedicated sub-tokenizer.
        case '#':
            return comment(tokenizer);
        case ':':
            return colon(tokenizer);
        case '"':
            return string(tokenizer);

        case ' ':
        case '\r':
        case '\t':
            // Skip whitespace
            break;

        default:
            // The <ctype.h> functions require a value representable as
            // unsigned char (or EOF). Passing a plain char is undefined for
            // bytes above 0x7F on platforms where char is signed.
            if (isdigit((unsigned char)byte)) {
                return number(tokenizer, need, tokenizer->position, byte, false);
            }
            return maybe_name(tokenizer, need, byte);
        }
    }
}
// Tokenize a comment. Called after the introducing '#' was read.
//
// Collects every byte up to (but not including) the next '\n' or the end of
// the input into an APFL_TOK_COMMENT token positioned at the '#'. The line
// break itself is pushed back so that it is tokenized separately.
//
// Returns APFL_PARSE_OK with the token stored, or APFL_PARSE_ERROR if reading
// failed or the text buffer could not grow. The string builder is released on
// every exit path, including the error paths.
static enum apfl_parse_result
comment(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_position pos = tokenizer->position;

    struct apfl_string_builder text;
    apfl_string_builder_init(&text);

    for (;;) {
        struct apfl_position last_pos = tokenizer->position;
        char byte;
        bool finished = false;

        switch (read_byte(tokenizer, &byte, true)) {
        case RR_OK:
            if (byte == '\n') {
                // The line break is not part of the comment.
                unread_byte(tokenizer, last_pos);
                finished = true;
            }
            break;
        case RR_ERR:
            // read_byte() already stored the error; drop what was collected.
            apfl_string_builder_deinit(&text);
            return APFL_PARSE_ERROR;
        case RR_EOF:
            // A comment may be terminated by the end of the input.
            tokenizer->next_mode = NM_EOF;
            finished = true;
            break;
        }

        if (finished) {
            tokenizer->token = (struct apfl_token) {
                .type = APFL_TOK_COMMENT,
                .position = pos,
                .text = apfl_string_builder_move_string(&text),
            };
            // Same init / move / deinit sequence as string() uses.
            apfl_string_builder_deinit(&text);
            return APFL_PARSE_OK;
        }

        if (!apfl_string_builder_append_byte(&text, byte)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            apfl_string_builder_deinit(&text);
            return APFL_PARSE_ERROR;
        }
    }
}
// Tokenize what follows a ':' that was just read. The only valid continuation
// is '=', which yields an APFL_TOK_LOCAL_ASSIGN token positioned at the ':'.
//
// End of input yields APFL_ERR_UNEXPECTED_EOF, any other byte yields
// APFL_ERR_EXPECTED_EQ_AFTER_COLON positioned at that byte.
static enum apfl_parse_result
colon(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_position colon_pos = tokenizer->position;
    char next;

    enum read_result rr = read_byte(tokenizer, &next, true);
    if (rr == RR_ERR) {
        return APFL_PARSE_ERROR;
    }
    if (rr == RR_EOF) {
        tokenizer->next_mode = NM_EOF;
        tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
        return APFL_PARSE_ERROR;
    }

    if (next == '=') {
        return yield_simple_token(tokenizer, APFL_TOK_LOCAL_ASSIGN, colon_pos);
    }

    tokenizer->error = (struct apfl_error) {
        .type = APFL_ERR_EXPECTED_EQ_AFTER_COLON,
        .position = tokenizer->position,
    };
    return APFL_PARSE_ERROR;
}
// Append `byte` to `text`. On failure an APFL_ERR_MALLOC_FAILED error is
// stored in the tokenizer and APFL_PARSE_ERROR is returned.
static enum apfl_parse_result
append_single_byte(
    apfl_tokenizer_ptr tokenizer,
    struct apfl_string_builder *text,
    char byte
) {
    if (apfl_string_builder_append_byte(text, byte)) {
        return APFL_PARSE_OK;
    }

    tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
    return APFL_PARSE_ERROR;
}
// Value (0..15) of the hexadecimal digit `byte`, accepting both upper and
// lower case letters, or -1 if `byte` is not a hexadecimal digit.
static int
unhex(char byte)
{
    static const char lower[] = "abcdef";
    static const char upper[] = "ABCDEF";

    // The decimal digits are guaranteed to be contiguous in C.
    if (byte >= '0' && byte <= '9') {
        return byte - '0';
    }

    for (int i = 0; i < 6; i++) {
        if (byte == lower[i] || byte == upper[i]) {
            return 0xA + i;
        }
    }

    return -1;
}
// Value (0..9) of the decimal digit `byte`, or -1 if it is not a decimal digit.
static int
undec(char byte)
{
    // The decimal digits are guaranteed to be contiguous in C.
    return (byte >= '0' && byte <= '9') ? byte - '0' : -1;
}
// Value (0..7) of the octal digit `byte`, or -1 if it is not an octal digit.
static int
unoct(char byte)
{
    // The decimal digits are guaranteed to be contiguous in C.
    return (byte >= '0' && byte <= '7') ? byte - '0' : -1;
}
// Value (0 or 1) of the binary digit `byte`, or -1 if it is not a binary digit.
static int
unbin(char byte)
{
    if (byte == '0') {
        return 0;
    }
    if (byte == '1') {
        return 1;
    }
    return -1;
}
// Handle the payload of a hexadecimal escape inside a string: read exactly two
// hexadecimal digits and append the byte they encode to `text`.
//
// End of input yields APFL_ERR_UNEXPECTED_EOF, a non-hexadecimal byte yields
// APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE positioned at that byte.
static enum apfl_parse_result
hex_escape(
    apfl_tokenizer_ptr tokenizer,
    struct apfl_string_builder *text
) {
    char value = 0;
    int remaining = 2;

    while (remaining > 0) {
        char digit_byte;
        enum read_result rr = read_byte(tokenizer, &digit_byte, true);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
            return APFL_PARSE_ERROR;
        }

        int nibble = unhex(digit_byte);
        if (nibble < 0) {
            tokenizer->error = (struct apfl_error) {
                .type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE,
                .position = tokenizer->position,
            };
            return APFL_PARSE_ERROR;
        }

        // Make room for the new nibble, then merge it into the low four bits.
        value <<= 4;
        value |= nibble & 0xF;

        remaining--;
    }

    return append_single_byte(tokenizer, text, value);
}
// Handle an escape sequence inside a string. Reads the byte that selects the
// escape and appends the byte it stands for to `text`.
//
// Supported selectors: x / X (two hex digits follow), backslash, n, r, t,
// double quote and 0 (NUL byte). Anything else yields
// APFL_ERR_INVALID_ESCAPE_SEQUENCE carrying the offending byte and the
// position the tokenizer had when this function was entered.
//
// TODO: 'u' / 'U' (unicode) escapes are not implemented.
static enum apfl_parse_result
escape_sequence(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
{
    struct apfl_position start_pos = tokenizer->position;
    char selector;

    enum read_result rr = read_byte(tokenizer, &selector, true);
    if (rr == RR_ERR) {
        return APFL_PARSE_ERROR;
    }
    if (rr == RR_EOF) {
        tokenizer->next_mode = NM_EOF;
        tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
        return APFL_PARSE_ERROR;
    }

    char replacement;

    switch (selector) {
    case 'x':
    case 'X':
        return hex_escape(tokenizer, text);
    case '\\':
        replacement = '\\';
        break;
    case 'n':
        replacement = '\n';
        break;
    case 'r':
        replacement = '\r';
        break;
    case 't':
        replacement = '\t';
        break;
    case '"':
        replacement = '"';
        break;
    case '0':
        replacement = 0;
        break;
    default:
        tokenizer->error = (struct apfl_error) {
            .type = APFL_ERR_INVALID_ESCAPE_SEQUENCE,
            .position = start_pos,
            .byte = selector,
        };
        return APFL_PARSE_ERROR;
    }

    return append_single_byte(tokenizer, text, replacement);
}
// Read the body of a string literal into `text` until the closing double
// quote and store the resulting APFL_TOK_STRING token, positioned where the
// tokenizer stood when this function was entered.
//
// Backslashes start an escape sequence. End of input before the closing quote
// yields APFL_ERR_UNEXPECTED_EOF. `text` is owned by the caller, which is
// responsible for deinitialising it on every outcome.
static enum apfl_parse_result
inner_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text)
{
    struct apfl_position start_pos = tokenizer->position;

    for (;;) {
        char byte;
        enum read_result rr = read_byte(tokenizer, &byte, true);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF };
            return APFL_PARSE_ERROR;
        }

        if (byte == '"') {
            tokenizer->token = (struct apfl_token) {
                .type = APFL_TOK_STRING,
                .position = start_pos,
                .text = apfl_string_builder_move_string(text),
            };
            return APFL_PARSE_OK;
        }

        if (byte == '\\') {
            enum apfl_parse_result escape_result = escape_sequence(tokenizer, text);
            if (escape_result != APFL_PARSE_OK) {
                return escape_result;
            }
            continue;
        }

        if (!apfl_string_builder_append_byte(text, byte)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }
    }
}
// Tokenize a string literal. Called after the opening double quote was read.
// Owns the string builder used by inner_string() and releases it regardless
// of the outcome.
static enum apfl_parse_result
string(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_string_builder builder;
    enum apfl_parse_result result;

    apfl_string_builder_init(&builder);
    result = inner_string(tokenizer, &builder);
    apfl_string_builder_deinit(&builder);

    return result;
}
// Turn the bytes collected in `text` into a token at position `pos`.
//
// A lone "=" becomes an APFL_TOK_ASSIGN token; everything else becomes an
// APFL_TOK_NAME token that takes over the collected text. `text` must not be
// empty.
static enum apfl_parse_result
finalize_maybe_name(
    apfl_tokenizer_ptr tokenizer,
    struct apfl_string_builder *text,
    struct apfl_position pos
) {
    assert(text->len > 0);

    bool is_lone_equals = (text->len == 1) && (text->bytes[0] == '=');

    if (!is_lone_equals) {
        tokenizer->token = (struct apfl_token) {
            .type = APFL_TOK_NAME,
            .position = pos,
            .text = apfl_string_builder_move_string(text),
        };
        return APFL_PARSE_OK;
    }

    tokenizer->token = (struct apfl_token) {
        .type = APFL_TOK_ASSIGN,
        .position = pos,
    };
    return APFL_PARSE_OK;
}
// Whether `byte` counts as part of a word: any alphanumeric character as
// classified by isalnum(), or any byte above 0x7F.
static bool
is_word_byte(unsigned char byte)
{
    if (byte > 0x7F) {
        return true;
    }
    return isalnum(byte) != 0;
}
// Tokenize a name (or one of the tokens that can only be told apart from a
// name by looking ahead). `byte` is the first, already consumed byte; bytes
// are collected in `text`, which is owned by the caller.
//
// The name ends at any byte that starts another kind of token or at
// whitespace; that byte is pushed back. Three special cases are handled:
//   - '=' after a word byte ends the name, the '=' is emitted as
//     APFL_TOK_ASSIGN by the next apfl_tokenizer_next() call,
//   - "->" ends the name; it is emitted as APFL_TOK_MAPSTO, either right away
//     (nothing precedes it) or by the next apfl_tokenizer_next() call,
//   - '-' followed by a digit starts a negative number, either right away
//     (nothing precedes it) or on the next apfl_tokenizer_next() call.
//
// `need` is forwarded to read_byte() and number().
static enum apfl_parse_result
maybe_name_inner(
    apfl_tokenizer_ptr tokenizer,
    bool need,
    char byte,
    struct apfl_string_builder *text
) {
    struct apfl_position pos = tokenizer->position;
    struct apfl_position last_pos;
    char last_byte;

    if (!apfl_string_builder_append_byte(text, byte)) {
        tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
        return APFL_PARSE_ERROR;
    }

    for (;;) {
        // Remember the previous byte and its position: both are needed for
        // the look-behind checks below and for unread_byte().
        last_byte = byte;
        last_pos = tokenizer->position;

        switch (read_byte(tokenizer, &byte, need)) {
        case RR_OK:
            break;
        case RR_ERR:
            return APFL_PARSE_ERROR;
        case RR_EOF:
            tokenizer->next_mode = NM_EOF;
            return finalize_maybe_name(tokenizer, text, pos);
        }

        switch (byte) {
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        case '~':
        case '.':
        case '@':
        case ';':
        case '\n':
        case '\\':
        case ',':
        case '?':
        case '\'':
        case '#':
        case ':':
        case '"':
        case ' ':
        case '\r':
        case '\t':
            // Start of another token or whitespace: the name ends here.
            unread_byte(tokenizer, last_pos);
            return finalize_maybe_name(tokenizer, text, pos);
        case '=':
            if (is_word_byte(last_byte)) {
                // The '=' stays consumed and is delivered by the next call.
                tokenizer->next_mode = NM_ASSIGN;
                return finalize_maybe_name(tokenizer, text, pos);
            }
            break;
        case '>':
            if (last_byte == '-') {
                text->len--; // This removes the '-' from the end of text
                if (text->len == 0) {
                    return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, last_pos);
                }
                tokenizer->next_mode = NM_MAPSTO;
                tokenizer->pos_for_mapsto = last_pos;
                return finalize_maybe_name(tokenizer, text, pos);
            }
            break;
        default:
            // The <ctype.h> functions require a value representable as
            // unsigned char (or EOF). Passing a plain char is undefined for
            // bytes above 0x7F on platforms where char is signed.
            if (isdigit((unsigned char)byte) && last_byte == '-') {
                text->len--; // This removes the '-' from the end of text
                if (text->len == 0) {
                    return number(tokenizer, need, pos, byte, true);
                }
                tokenizer->next_mode = NM_NEGATIVE_NUMBER;
                tokenizer->first_digit_for_negative_number = byte;
                return finalize_maybe_name(tokenizer, text, pos);
            }
            break;
        }

        if (!apfl_string_builder_append_byte(text, byte)) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }
    }
}
// Tokenize a name starting with the already consumed `first_byte`. Owns the
// string builder used by maybe_name_inner() and releases it regardless of the
// outcome.
static enum apfl_parse_result
maybe_name(apfl_tokenizer_ptr tokenizer, bool need, char first_byte)
{
    struct apfl_string_builder builder;
    enum apfl_parse_result result;

    apfl_string_builder_init(&builder);
    result = maybe_name_inner(tokenizer, need, first_byte, &builder);
    apfl_string_builder_deinit(&builder);

    return result;
}
static struct apfl_token
build_number_token(double number, struct apfl_position position, bool negative)
{
if (negative) {
number *= -1;
}
return (struct apfl_token) {
.type = APFL_TOK_NUMBER,
.position = position,
.number = (apfl_number)number,
};
}
// Tokenize the digits of a number written in a power-of-two base. Called by
// the generated hex_number / oct_number / bin_number wrappers.
//
//   position       position to report for the resulting token
//   negative       whether the resulting number is negated
//   shift          number of bits each digit contributes
//   byte_to_digit  maps a byte to its digit value, negative if it is no digit
//
// At least one digit is required: until one was seen, input is always
// requested as needed and both end of input (APFL_ERR_UNEXPECTED_EOF) and a
// non-digit (APFL_ERR_EXPECTED_DIGIT) are errors. After that the number ends
// at the first non-digit, which is pushed back, unless it is a word byte, in
// which case APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER is reported.
//
// The value is accumulated in a uint64_t; bits shifted beyond its width are
// lost.
static enum apfl_parse_result
non_decimal_number(
    apfl_tokenizer_ptr tokenizer,
    bool need,
    struct apfl_position position,
    bool negative,
    int shift,
    int (*byte_to_digit)(char))
{
    uint64_t value = 0;
    bool have_digit = false;

    for (;;) {
        struct apfl_position before_read = tokenizer->position;
        char byte;

        enum read_result rr = read_byte(tokenizer, &byte, !have_digit || need);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            if (have_digit) {
                tokenizer->token = build_number_token((double)value, position, negative);
                return APFL_PARSE_OK;
            }
            tokenizer->error = (struct apfl_error) {
                .type = APFL_ERR_UNEXPECTED_EOF,
            };
            return APFL_PARSE_ERROR;
        }

        int digit = byte_to_digit(byte);
        if (digit >= 0) {
            value <<= shift;
            value |= (uint64_t)digit;
            have_digit = true;
            continue;
        }

        if (!have_digit) {
            tokenizer->error = (struct apfl_error) {
                .type = APFL_ERR_EXPECTED_DIGIT,
                .position = tokenizer->position,
            };
            return APFL_PARSE_ERROR;
        }

        if (is_word_byte(byte)) {
            tokenizer->error = (struct apfl_error) {
                .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
                .position = tokenizer->position,
                .byte = byte,
            };
            return APFL_PARSE_ERROR;
        }

        // The byte belongs to whatever follows the number.
        unread_byte(tokenizer, before_read);
        tokenizer->token = build_number_token((double)value, position, negative);
        return APFL_PARSE_OK;
    }
}
// Defines a static function `name(tokenizer, need, position, negative)` that
// forwards to non_decimal_number() with the given bits-per-digit `shift` and
// digit conversion function `byte_to_digit`.
#define BUILD_NON_DECIMAL_TOKENIZER(name, shift, byte_to_digit) \
static enum apfl_parse_result \
name( \
apfl_tokenizer_ptr tokenizer, \
bool need, \
struct apfl_position position, \
bool negative \
) { \
return non_decimal_number( \
tokenizer, \
need, \
position, \
negative, \
shift, \
byte_to_digit \
); \
}
// Tokenizers for hexadecimal (4 bits per digit), octal (3 bits per digit) and
// binary (1 bit per digit) numbers; number() calls them after a leading 0
// followed by x/X, o/O or b/B.
BUILD_NON_DECIMAL_TOKENIZER(hex_number, 4, unhex)
BUILD_NON_DECIMAL_TOKENIZER(oct_number, 3, unoct)
BUILD_NON_DECIMAL_TOKENIZER(bin_number, 1, unbin)
// Tokenize a number whose first decimal digit `first_digit` was already
// consumed. `position` is reported as the position of the token and `negative`
// negates the result. `need` is forwarded to read_byte() and to the
// non-decimal tokenizers.
//
// If the first digit is '0' and is directly followed by x/X, b/B or o/O, the
// rest is handled by hex_number / bin_number / oct_number. Otherwise decimal
// digits with at most one '.' are accepted. All digits are accumulated into
// one value which is divided by a power of ten for the digits after the dot.
//
// The number ends at end of input, at a second '.', or at any other
// non-digit; the terminating byte is pushed back. A word byte directly after
// the number yields APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER.
static enum apfl_parse_result
number(
    apfl_tokenizer_ptr tokenizer,
    bool need,
    struct apfl_position position,
    char first_digit,
    bool negative
) {
    double digits = (double)undec(first_digit);
    double scale = 1;
    bool after_dot = false;

    // A base prefix can only directly follow a leading '0', i.e. it is only
    // looked for in the first pass through the loop.
    bool may_have_base_prefix = (first_digit == '0');

    for (;;) {
        struct apfl_position before_read = tokenizer->position;
        char byte;

        enum read_result rr = read_byte(tokenizer, &byte, need);
        if (rr == RR_ERR) {
            return APFL_PARSE_ERROR;
        }
        if (rr == RR_EOF) {
            tokenizer->next_mode = NM_EOF;
            tokenizer->token = build_number_token(digits / scale, position, negative);
            return APFL_PARSE_OK;
        }

        if (may_have_base_prefix) {
            switch (byte) {
            case 'x':
            case 'X':
                return hex_number(tokenizer, need, position, negative);
            case 'b':
            case 'B':
                return bin_number(tokenizer, need, position, negative);
            case 'o':
            case 'O':
                return oct_number(tokenizer, need, position, negative);
            }
        }
        may_have_base_prefix = false;

        int digit = undec(byte);
        if (digit >= 0) {
            digits *= 10;
            digits += (double)digit;
            if (after_dot) {
                scale *= 10;
            }
            continue;
        }

        if (byte == '.' && !after_dot) {
            after_dot = true;
            continue;
        }

        // A second '.' simply ends the number; any other word byte glued to
        // the number is an error.
        if (byte != '.' && is_word_byte(byte)) {
            tokenizer->error = (struct apfl_error) {
                .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
                .position = tokenizer->position,
                .byte = byte,
            };
            return APFL_PARSE_ERROR;
        }

        unread_byte(tokenizer, before_read);
        tokenizer->token = build_number_token(digits / scale, position, negative);
        return APFL_PARSE_OK;
    }
}