#include #include #include #include #include #include #include #include "apfl.h" #include "alloc.h" #include "parsing.h" #define BUFSIZE 4096 typedef int buf_offset; static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset"); static_assert(BUFSIZE >= 2, "BUFSIZE must be at least 2"); struct apfl_tokenizer { struct apfl_allocator allocator; struct apfl_source_reader source_reader; unsigned char *buf; buf_offset buf_pos; buf_offset buf_len; enum { NM_REGULAR, NM_MAPSTO, NM_ASSIGN, NM_EOF, } next_mode; struct apfl_position pos_for_mapsto; struct apfl_position position; struct apfl_position last_position; bool last_byte_was_linebreak; bool prev_last_byte_was_linebreak; union { struct apfl_token token; struct apfl_error error; }; }; apfl_tokenizer_ptr apfl_tokenizer_new(struct apfl_allocator allocator, struct apfl_source_reader source_reader) { apfl_tokenizer_ptr tokenizer = ALLOC_OBJ(allocator, struct apfl_tokenizer); if (tokenizer == NULL) { return NULL; } tokenizer->allocator = allocator; tokenizer->source_reader = source_reader; if ((tokenizer->buf = ALLOC_BYTES(allocator, BUFSIZE)) == NULL) { FREE_OBJ(allocator, tokenizer); return NULL; } tokenizer->buf_pos = 0; tokenizer->buf_len = 0; tokenizer->position = (struct apfl_position) { .line = 1, .col = 0, // The first character was not yet read }; tokenizer->last_byte_was_linebreak = false; tokenizer->prev_last_byte_was_linebreak = false; tokenizer->next_mode = NM_REGULAR; return tokenizer; } void apfl_tokenizer_destroy(apfl_tokenizer_ptr tokenizer) { if (tokenizer == NULL) { return; } FREE_BYTES(tokenizer->allocator, tokenizer->buf, BUFSIZE); FREE_OBJ(tokenizer->allocator, tokenizer); } struct apfl_token apfl_tokenizer_get_token(apfl_tokenizer_ptr tokenizer) { return tokenizer->token; } struct apfl_error apfl_tokenizer_get_error(apfl_tokenizer_ptr tokenizer) { return tokenizer->error; } static enum read_result read_byte(apfl_tokenizer_ptr tokenizer, unsigned char *byte, bool need) { if (tokenizer->buf_pos >= tokenizer->buf_len) { size_t off = 0; if (tokenizer->buf_len > 0) { off = 1; tokenizer->buf[0] = tokenizer->buf[tokenizer->buf_len - 1]; } size_t len = BUFSIZE - off; tokenizer->buf_pos = off; tokenizer->buf_len = off; if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf+off, &len, need)) { tokenizer->error.type = APFL_ERR_INPUT_ERROR; return RR_ERR; } tokenizer->buf_len = len + off; if (len == 0) { return RR_EOF; } } tokenizer->prev_last_byte_was_linebreak = tokenizer->last_byte_was_linebreak; tokenizer->last_position = tokenizer->position; if (tokenizer->last_byte_was_linebreak) { tokenizer->position.line++; tokenizer->position.col = 0; } *byte = tokenizer->buf[tokenizer->buf_pos]; tokenizer->buf_pos++; tokenizer->last_byte_was_linebreak = (*byte == '\n'); tokenizer->position.col++; return RR_OK; } // Only at most 1 unread_byte() call is allowed after a read_byte() call! static void unread_byte(apfl_tokenizer_ptr tokenizer) { tokenizer->position = tokenizer->last_position; tokenizer->last_byte_was_linebreak = tokenizer->prev_last_byte_was_linebreak; assert(tokenizer->buf_pos > 0); tokenizer->buf_pos--; } static enum apfl_parse_result yield_simple_token( apfl_tokenizer_ptr tokenizer, enum apfl_token_type type, struct apfl_position pos ) { tokenizer->token.type = type; tokenizer->token.position = pos; return APFL_PARSE_OK; } static enum apfl_parse_result comment(apfl_tokenizer_ptr); static enum apfl_parse_result colon(apfl_tokenizer_ptr); static enum apfl_parse_result string(apfl_tokenizer_ptr); static enum apfl_parse_result backtick_string(apfl_tokenizer_ptr); static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, unsigned char); static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool); static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool); static bool is_control_byte(unsigned char byte) { return byte < 0x20 || byte == 0x7F; } static enum apfl_parse_result minus(apfl_tokenizer_ptr tokenizer) { struct apfl_position pos = tokenizer->position; unsigned char byte; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; struct apfl_string str = apfl_string_blank(); if (!apfl_string_copy(tokenizer->allocator, &str, apfl_string_view_from("-"))) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } tokenizer->token = (struct apfl_token) { .type = APFL_TOK_NAME, .position = pos, .text = str, }; return APFL_PARSE_OK; } switch (byte) { case '0': return zero(tokenizer, pos, true); case '>': return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, pos); default: unread_byte(tokenizer); if (isdigit(byte)) { return number(tokenizer, 10, pos, true); } else { return maybe_name(tokenizer, true, '-'); } } } enum apfl_parse_result apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) { switch (tokenizer->next_mode) { case NM_REGULAR: break; case NM_MAPSTO: tokenizer->next_mode = NM_REGULAR; return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto); case NM_ASSIGN: tokenizer->next_mode = NM_REGULAR; return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position); case NM_EOF: return APFL_PARSE_EOF; } unsigned char byte; for (;;) { switch (read_byte(tokenizer, &byte, need)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; return APFL_PARSE_EOF; } switch (byte) { case '(': return yield_simple_token(tokenizer, APFL_TOK_LPAREN, tokenizer->position); case ')': return yield_simple_token(tokenizer, APFL_TOK_RPAREN, tokenizer->position); case '[': return yield_simple_token(tokenizer, APFL_TOK_LBRACKET, tokenizer->position); case ']': return yield_simple_token(tokenizer, APFL_TOK_RBRACKET, tokenizer->position); case '{': return yield_simple_token(tokenizer, APFL_TOK_LBRACE, tokenizer->position); case '}': return yield_simple_token(tokenizer, APFL_TOK_RBRACE, tokenizer->position); case '~': return yield_simple_token(tokenizer, APFL_TOK_EXPAND, tokenizer->position); case '.': return yield_simple_token(tokenizer, APFL_TOK_DOT, tokenizer->position); case '@': return yield_simple_token(tokenizer, APFL_TOK_AT, tokenizer->position); case ';': return yield_simple_token(tokenizer, APFL_TOK_SEMICOLON, tokenizer->position); case '\n': return yield_simple_token(tokenizer, APFL_TOK_LINEBREAK, tokenizer->position); case '\\': return yield_simple_token(tokenizer, APFL_TOK_CONTINUE_LINE, tokenizer->position); case ',': return yield_simple_token(tokenizer, APFL_TOK_COMMA, tokenizer->position); case '?': return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position); case '\'': return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position); case '`': return backtick_string(tokenizer); case '#': return comment(tokenizer); case ':': return colon(tokenizer); case '"': return string(tokenizer); case '-': return minus(tokenizer); case ' ': case '\r': case '\t': // Skip whitespace break; case '0': return zero(tokenizer, tokenizer->position, false); default: if (is_control_byte(byte)) { // Disallow ASCII control characters here tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_BYTE, .position = tokenizer->position, .byte = byte, }; return APFL_PARSE_ERROR; } else if (isdigit(byte)) { struct apfl_position position = tokenizer->position; unread_byte(tokenizer); return number(tokenizer, 10, position, false); } else { return maybe_name(tokenizer, need, byte); } } } } static enum apfl_parse_result comment(apfl_tokenizer_ptr tokenizer) { unsigned char byte; struct apfl_position pos = tokenizer->position; struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator); for (;;) { switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->token = (struct apfl_token) { .type = APFL_TOK_COMMENT, .position = pos, .text = apfl_string_builder_move_string(&text), }; return APFL_PARSE_OK; } if (byte == '\n') { unread_byte(tokenizer); tokenizer->token = (struct apfl_token) { .type = APFL_TOK_COMMENT, .position = pos, .text = apfl_string_builder_move_string(&text), }; return APFL_PARSE_OK; } if (!apfl_string_builder_append_byte(&text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } } } static enum apfl_parse_result colon(apfl_tokenizer_ptr tokenizer) { unsigned char byte; struct apfl_position pos = tokenizer->position; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: return yield_simple_token(tokenizer, APFL_TOK_COLON, pos); } switch (byte) { case '=': return yield_simple_token(tokenizer, APFL_TOK_LOCAL_ASSIGN, pos); case ':': return yield_simple_token(tokenizer, APFL_TOK_DOUBLE_COLON, pos); default: unread_byte(tokenizer); return yield_simple_token(tokenizer, APFL_TOK_COLON, pos); } } static enum apfl_parse_result append_single_byte( apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text, char byte ) { if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } return APFL_PARSE_OK; } static enum apfl_parse_result hex_escape( apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text ) { unsigned char escaped_byte = 0; for (int i = 0; i < 2; i++) { unsigned char byte; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } int nibble = apfl_parse_digit(byte); if (nibble < 0 || nibble > 0xF) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE, .position = tokenizer->position, }; return APFL_PARSE_ERROR; } escaped_byte <<= 4; escaped_byte |= 0xF & nibble; } return append_single_byte(tokenizer, text, escaped_byte); } static enum apfl_parse_result escape_sequence(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text) { struct apfl_position pos = tokenizer->position; unsigned char byte; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } switch (byte) { case 'x': case 'X': return hex_escape(tokenizer, text); // case 'u': // case 'U': // return unicode_escape(tokenizer, pos, text); case '\\': return append_single_byte(tokenizer, text, '\\'); case 'n': return append_single_byte(tokenizer, text, '\n'); case 'r': return append_single_byte(tokenizer, text, '\r'); case 't': return append_single_byte(tokenizer, text, '\t'); case '"': return append_single_byte(tokenizer, text, '"'); case '0': return append_single_byte(tokenizer, text, 0); default: tokenizer->error = (struct apfl_error) { .type = APFL_ERR_INVALID_ESCAPE_SEQUENCE, .position = pos, .byte = byte, }; return APFL_PARSE_ERROR; } } static enum apfl_parse_result inner_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text) { struct apfl_position pos = tokenizer->position; unsigned char byte; enum apfl_parse_result subresult; for (;;) { switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } switch (byte) { case '"': tokenizer->token = (struct apfl_token) { .type = APFL_TOK_STRING, .position = pos, .text = apfl_string_builder_move_string(text), }; return APFL_PARSE_OK; case '\\': if ((subresult = escape_sequence(tokenizer, text)) != APFL_PARSE_OK) { return subresult; } break; default: if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } } } } static enum apfl_parse_result string(apfl_tokenizer_ptr tokenizer) { struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator); enum apfl_parse_result out = inner_string(tokenizer, &text); apfl_string_builder_deinit(&text); return out; } static enum apfl_parse_result inner_backtick_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text) { struct apfl_position pos = tokenizer->position; unsigned char byte; for (;;) { switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } if (byte != '`') { if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } continue; } switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; goto finalize; } if (byte == '`') { if (!apfl_string_builder_append_byte(text, '`')) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } continue; } unread_byte(tokenizer); goto finalize; } finalize: tokenizer->token = (struct apfl_token) { .type = APFL_TOK_STRING, .position = pos, .text = apfl_string_builder_move_string(text), }; return APFL_PARSE_OK; } static enum apfl_parse_result backtick_string(apfl_tokenizer_ptr tokenizer) { struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator); enum apfl_parse_result out = inner_backtick_string(tokenizer, &text); apfl_string_builder_deinit(&text); return out; } static enum apfl_parse_result finalize_maybe_name( apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text, struct apfl_position pos ) { assert(text->len > 0); if (text->len == 1 && text->bytes[0] == '=') { tokenizer->token = (struct apfl_token) { .type = APFL_TOK_ASSIGN, .position = pos, }; } else { tokenizer->token = (struct apfl_token) { .type = APFL_TOK_NAME, .position = pos, .text = apfl_string_builder_move_string(text), }; } return APFL_PARSE_OK; } static bool is_word_byte(unsigned char byte) { return isalnum(byte) || byte > 0x7F; } static enum apfl_parse_result maybe_name_inner( apfl_tokenizer_ptr tokenizer, bool need, unsigned char byte, struct apfl_string_builder *text ) { struct apfl_position pos = tokenizer->position; struct apfl_position last_pos; unsigned char last_byte; if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } for (;;) { last_byte = byte; last_pos = tokenizer->position; switch (read_byte(tokenizer, &byte, need)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; return finalize_maybe_name(tokenizer, text, pos); } switch (byte) { case '(': case ')': case '[': case ']': case '{': case '}': case '~': case '.': case '@': case ';': case '\n': case '\\': case ',': case '?': case '\'': case '#': case ':': case '"': case '`': case ' ': case '\r': case '\t': unread_byte(tokenizer); return finalize_maybe_name(tokenizer, text, pos); case '=': if (is_word_byte(last_byte)) { tokenizer->next_mode = NM_ASSIGN; return finalize_maybe_name(tokenizer, text, pos); } break; case '>': if (last_byte == '-') { text->len--; // This removes the '-' from the end of text if (text->len == 0) { return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, last_pos); } tokenizer->next_mode = NM_MAPSTO; tokenizer->pos_for_mapsto = last_pos; return finalize_maybe_name(tokenizer, text, pos); } break; default: if (is_control_byte(byte)) { // Disallow ASCII control characters in names unread_byte(tokenizer); return finalize_maybe_name(tokenizer, text, pos); } break; } if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } } } static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr tokenizer, bool need, unsigned char first_byte) { struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator); enum apfl_parse_result out = maybe_name_inner(tokenizer, need, first_byte, &text); apfl_string_builder_deinit(&text); return out; } static struct apfl_token build_number_token(double number, struct apfl_position position, bool negative) { if (negative) { number *= -1; } return (struct apfl_token) { .type = APFL_TOK_NUMBER, .position = position, .number = (apfl_number)number, }; } static enum apfl_parse_result zero(apfl_tokenizer_ptr tokenizer, struct apfl_position position, bool negative) { unsigned char byte; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->token = build_number_token(0, position, negative); return APFL_PARSE_OK; } switch (byte) { case 'x': case 'X': return number(tokenizer, 16, position, negative); case 'o': case 'O': return number(tokenizer, 8, position, negative); case 'b': case 'B': return number(tokenizer, 2, position, negative); default: unread_byte(tokenizer); return number(tokenizer, 10, position, negative); } } static enum read_result read_for_parse_number(void *opaque, unsigned char *byte) { apfl_tokenizer_ptr tokenizer = opaque; return read_byte(tokenizer, byte, true); } static void unread_for_parse_number(void *opaque) { apfl_tokenizer_ptr tokenizer = opaque; unread_byte(tokenizer); } static enum apfl_parse_result number(apfl_tokenizer_ptr tokenizer, unsigned base, struct apfl_position pos, bool negative) { apfl_number num; if (!apfl_parse_number( base, read_for_parse_number, unread_for_parse_number, tokenizer, &num )) { return APFL_PARSE_ERROR; } unsigned char byte; switch (read_byte(tokenizer, &byte, false)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->token = build_number_token(num, pos, negative); return APFL_PARSE_OK; } if (is_word_byte(byte)) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER, .position = tokenizer->position, .byte = byte, }; return APFL_PARSE_ERROR; } unread_byte(tokenizer); tokenizer->token = build_number_token(num, pos, negative); return APFL_PARSE_OK; } static enum apfl_parse_result token_source_wrap_next(void *opaque, bool need) { return apfl_tokenizer_next(opaque, need); } static struct apfl_token token_source_wrap_get_token(void *opaque) { return apfl_tokenizer_get_token(opaque); } static struct apfl_error token_source_wrap_get_error(void *opaque) { return apfl_tokenizer_get_error(opaque); } struct apfl_parser_token_source apfl_tokenizer_as_token_source(apfl_tokenizer_ptr p) { return (struct apfl_parser_token_source) { .next = token_source_wrap_next, .get_token = token_source_wrap_get_token, .get_error = token_source_wrap_get_error, .opaque = p, }; }