#include #include #include #include #include #include #include #include "apfl.h" #include "alloc.h" #define BUFSIZE 4096 typedef int buf_offset; static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset"); struct apfl_tokenizer { struct apfl_allocator allocator; struct apfl_source_reader source_reader; char *buf; buf_offset buf_pos; buf_offset buf_len; enum { NM_REGULAR, NM_NEGATIVE_NUMBER, NM_MAPSTO, NM_ASSIGN, NM_EOF, } next_mode; struct apfl_position pos_for_mapsto; char first_digit_for_negative_number; struct apfl_position position; bool last_byte_was_linebreak; union { struct apfl_token token; struct apfl_error error; }; }; apfl_tokenizer_ptr apfl_tokenizer_new(struct apfl_allocator allocator, struct apfl_source_reader source_reader) { apfl_tokenizer_ptr tokenizer = ALLOC_OBJ(allocator, struct apfl_tokenizer); if (tokenizer == NULL) { return NULL; } tokenizer->allocator = allocator; tokenizer->source_reader = source_reader; if ((tokenizer->buf = ALLOC_BYTES(allocator, BUFSIZE)) == NULL) { FREE_OBJ(allocator, tokenizer); return NULL; } tokenizer->buf_pos = 0; tokenizer->buf_len = 0; tokenizer->position = (struct apfl_position) { .line = 1, .col = 0, // The first character was not yet read }; tokenizer->last_byte_was_linebreak = false; tokenizer->next_mode = NM_REGULAR; return tokenizer; } void apfl_tokenizer_destroy(apfl_tokenizer_ptr tokenizer) { if (tokenizer == NULL) { return; } FREE_BYTES(tokenizer->allocator, tokenizer->buf, BUFSIZE); FREE_OBJ(tokenizer->allocator, tokenizer); } struct apfl_token apfl_tokenizer_get_token(apfl_tokenizer_ptr tokenizer) { return tokenizer->token; } struct apfl_error apfl_tokenizer_get_error(apfl_tokenizer_ptr tokenizer) { return tokenizer->error; } enum read_result { RR_OK, RR_ERR, RR_EOF, }; static enum read_result read_byte(apfl_tokenizer_ptr tokenizer, char *byte, bool need) { if (tokenizer->buf_pos >= tokenizer->buf_len) { size_t len = BUFSIZE; tokenizer->buf_pos = 0; tokenizer->buf_len = 0; if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf, &len, need)) { tokenizer->error.type = APFL_ERR_INPUT_ERROR; return RR_ERR; } tokenizer->buf_len = len; if (len == 0) { return RR_EOF; } } if (tokenizer->last_byte_was_linebreak) { tokenizer->position.line++; tokenizer->position.col = 0; } *byte = tokenizer->buf[tokenizer->buf_pos]; tokenizer->buf_pos++; tokenizer->last_byte_was_linebreak = (*byte == '\n'); tokenizer->position.col++; return RR_OK; } // Only at most 1 unread_byte() call is allowed after a read_byte() call! static void unread_byte(apfl_tokenizer_ptr tokenizer, struct apfl_position pos) { tokenizer->position = pos; tokenizer->buf_pos--; tokenizer->last_byte_was_linebreak = false; } static enum apfl_parse_result yield_simple_token( apfl_tokenizer_ptr tokenizer, enum apfl_token_type type, struct apfl_position pos ) { tokenizer->token.type = type; tokenizer->token.position = pos; return APFL_PARSE_OK; } static enum apfl_parse_result comment(apfl_tokenizer_ptr); static enum apfl_parse_result colon(apfl_tokenizer_ptr); static enum apfl_parse_result string(apfl_tokenizer_ptr); static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, char); static enum apfl_parse_result number(apfl_tokenizer_ptr, bool, struct apfl_position, char, bool); static bool is_control_byte(unsigned char byte) { return byte < 0x20 || byte == 0x7F; } enum apfl_parse_result apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) { switch (tokenizer->next_mode) { case NM_REGULAR: break; case NM_MAPSTO: tokenizer->next_mode = NM_REGULAR; return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto); case NM_NEGATIVE_NUMBER: tokenizer->next_mode = NM_REGULAR; return number(tokenizer, need, tokenizer->position, tokenizer->first_digit_for_negative_number, true); case NM_ASSIGN: tokenizer->next_mode = NM_REGULAR; return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position); case NM_EOF: return APFL_PARSE_EOF; } char byte; for (;;) { switch (read_byte(tokenizer, &byte, need)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; return APFL_PARSE_EOF; } switch (byte) { case '(': return yield_simple_token(tokenizer, APFL_TOK_LPAREN, tokenizer->position); case ')': return yield_simple_token(tokenizer, APFL_TOK_RPAREN, tokenizer->position); case '[': return yield_simple_token(tokenizer, APFL_TOK_LBRACKET, tokenizer->position); case ']': return yield_simple_token(tokenizer, APFL_TOK_RBRACKET, tokenizer->position); case '{': return yield_simple_token(tokenizer, APFL_TOK_LBRACE, tokenizer->position); case '}': return yield_simple_token(tokenizer, APFL_TOK_RBRACE, tokenizer->position); case '~': return yield_simple_token(tokenizer, APFL_TOK_EXPAND, tokenizer->position); case '.': return yield_simple_token(tokenizer, APFL_TOK_DOT, tokenizer->position); case '@': return yield_simple_token(tokenizer, APFL_TOK_AT, tokenizer->position); case ';': return yield_simple_token(tokenizer, APFL_TOK_SEMICOLON, tokenizer->position); case '\n': return yield_simple_token(tokenizer, APFL_TOK_LINEBREAK, tokenizer->position); case '\\': return yield_simple_token(tokenizer, APFL_TOK_CONTINUE_LINE, tokenizer->position); case ',': return yield_simple_token(tokenizer, APFL_TOK_COMMA, tokenizer->position); case '?': return yield_simple_token(tokenizer, APFL_TOK_QUESTION_MARK, tokenizer->position); case '\'': return yield_simple_token(tokenizer, APFL_TOK_STRINGIFY, tokenizer->position); case '#': return comment(tokenizer); case ':': return colon(tokenizer); case '"': return string(tokenizer); case ' ': case '\r': case '\t': // Skip whitespace break; default: if (is_control_byte(byte)) { // Disallow ASCII control characters here tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_BYTE, .position = tokenizer->position, .byte = byte, }; return APFL_PARSE_ERROR; } else if (isdigit(byte)) { return number(tokenizer, need, tokenizer->position, byte, false); } else { return maybe_name(tokenizer, need, byte); } } } } static enum apfl_parse_result comment(apfl_tokenizer_ptr tokenizer) { char byte; struct apfl_position pos = tokenizer->position; struct apfl_position last_pos; struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator); for (;;) { last_pos = tokenizer->position; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->token = (struct apfl_token) { .type = APFL_TOK_COMMENT, .position = pos, .text = apfl_string_builder_move_string(&text), }; return APFL_PARSE_OK; } if (byte == '\n') { unread_byte(tokenizer, last_pos); tokenizer->token = (struct apfl_token) { .type = APFL_TOK_COMMENT, .position = pos, .text = apfl_string_builder_move_string(&text), }; return APFL_PARSE_OK; } if (!apfl_string_builder_append_byte(&text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } } } static enum apfl_parse_result colon(apfl_tokenizer_ptr tokenizer) { char byte; struct apfl_position pos = tokenizer->position; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } if (byte != '=') { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_EXPECTED_EQ_AFTER_COLON, .position = tokenizer->position, }; return APFL_PARSE_ERROR; } return yield_simple_token(tokenizer, APFL_TOK_LOCAL_ASSIGN, pos); } static enum apfl_parse_result append_single_byte( apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text, char byte ) { if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } return APFL_PARSE_OK; } static int unhex(char byte) { switch (byte) { case '0': return 0x0; case '1': return 0x1; case '2': return 0x2; case '3': return 0x3; case '4': return 0x4; case '5': return 0x5; case '6': return 0x6; case '7': return 0x7; case '8': return 0x8; case '9': return 0x9; case 'a': case 'A': return 0xA; case 'b': case 'B': return 0xB; case 'c': case 'C': return 0xC; case 'd': case 'D': return 0xD; case 'e': case 'E': return 0xE; case 'f': case 'F': return 0xF; } return -1; } static int undec(char byte) { switch (byte) { case '0': return 0; case '1': return 1; case '2': return 2; case '3': return 3; case '4': return 4; case '5': return 5; case '6': return 6; case '7': return 7; case '8': return 8; case '9': return 9; } return -1; } static int unoct(char byte) { switch (byte) { case '0': return 0; case '1': return 1; case '2': return 2; case '3': return 3; case '4': return 4; case '5': return 5; case '6': return 6; case '7': return 7; } return -1; } static int unbin(char byte) { switch (byte) { case '0': return 0; case '1': return 1; } return -1; } static enum apfl_parse_result hex_escape( apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text ) { char escaped_byte = 0; for (int i = 0; i < 2; i++) { char byte; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } int nibble = unhex(byte); if (nibble < 0) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE, .position = tokenizer->position, }; return APFL_PARSE_ERROR; } escaped_byte <<= 4; escaped_byte |= 0xF & nibble; } return append_single_byte(tokenizer, text, escaped_byte); } static enum apfl_parse_result escape_sequence(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text) { struct apfl_position pos = tokenizer->position; char byte; switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } switch (byte) { case 'x': case 'X': return hex_escape(tokenizer, text); // case 'u': // case 'U': // return unicode_escape(tokenizer, pos, text); case '\\': return append_single_byte(tokenizer, text, '\\'); case 'n': return append_single_byte(tokenizer, text, '\n'); case 'r': return append_single_byte(tokenizer, text, '\r'); case 't': return append_single_byte(tokenizer, text, '\t'); case '"': return append_single_byte(tokenizer, text, '"'); case '0': return append_single_byte(tokenizer, text, 0); default: tokenizer->error = (struct apfl_error) { .type = APFL_ERR_INVALID_ESCAPE_SEQUENCE, .position = pos, .byte = byte, }; return APFL_PARSE_ERROR; } } static enum apfl_parse_result inner_string(apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text) { struct apfl_position pos = tokenizer->position; char byte; enum apfl_parse_result subresult; for (;;) { switch (read_byte(tokenizer, &byte, true)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF }; return APFL_PARSE_ERROR; } switch (byte) { case '"': tokenizer->token = (struct apfl_token) { .type = APFL_TOK_STRING, .position = pos, .text = apfl_string_builder_move_string(text), }; return APFL_PARSE_OK; case '\\': if ((subresult = escape_sequence(tokenizer, text)) != APFL_PARSE_OK) { return subresult; } break; default: if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } } } } static enum apfl_parse_result string(apfl_tokenizer_ptr tokenizer) { struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator); enum apfl_parse_result out = inner_string(tokenizer, &text); apfl_string_builder_deinit(&text); return out; } static enum apfl_parse_result finalize_maybe_name( apfl_tokenizer_ptr tokenizer, struct apfl_string_builder *text, struct apfl_position pos ) { assert(text->len > 0); if (text->len == 1 && text->bytes[0] == '=') { tokenizer->token = (struct apfl_token) { .type = APFL_TOK_ASSIGN, .position = pos, }; } else { tokenizer->token = (struct apfl_token) { .type = APFL_TOK_NAME, .position = pos, .text = apfl_string_builder_move_string(text), }; } return APFL_PARSE_OK; } static bool is_word_byte(unsigned char byte) { return isalnum(byte) || byte > 0x7F; } static enum apfl_parse_result maybe_name_inner( apfl_tokenizer_ptr tokenizer, bool need, char byte, struct apfl_string_builder *text ) { struct apfl_position pos = tokenizer->position; struct apfl_position last_pos; char last_byte; if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } for (;;) { last_byte = byte; last_pos = tokenizer->position; switch (read_byte(tokenizer, &byte, need)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; return finalize_maybe_name(tokenizer, text, pos); } switch (byte) { case '(': case ')': case '[': case ']': case '{': case '}': case '~': case '.': case '@': case ';': case '\n': case '\\': case ',': case '?': case '\'': case '#': case ':': case '"': case ' ': case '\r': case '\t': unread_byte(tokenizer, last_pos); return finalize_maybe_name(tokenizer, text, pos); case '=': if (is_word_byte(last_byte)) { tokenizer->next_mode = NM_ASSIGN; return finalize_maybe_name(tokenizer, text, pos); } break; case '>': if (last_byte == '-') { text->len--; // This removes the '-' from the end of text if (text->len == 0) { return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, last_pos); } tokenizer->next_mode = NM_MAPSTO; tokenizer->pos_for_mapsto = last_pos; return finalize_maybe_name(tokenizer, text, pos); } break; default: if (is_control_byte(byte)) { // Disallow ASCII control characters in names unread_byte(tokenizer, last_pos); return finalize_maybe_name(tokenizer, text, pos); } if (isdigit(byte) && last_byte == '-') { text->len--; // This removes the '-' from the end of text if (text->len == 0) { return number(tokenizer, need, pos, byte, true); } tokenizer->next_mode = NM_NEGATIVE_NUMBER; tokenizer->first_digit_for_negative_number = byte; return finalize_maybe_name(tokenizer, text, pos); } break; } if (!apfl_string_builder_append_byte(text, byte)) { tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); return APFL_PARSE_ERROR; } } } static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr tokenizer, bool need, char first_byte) { struct apfl_string_builder text = apfl_string_builder_init(tokenizer->allocator); enum apfl_parse_result out = maybe_name_inner(tokenizer, need, first_byte, &text); apfl_string_builder_deinit(&text); return out; } static struct apfl_token build_number_token(double number, struct apfl_position position, bool negative) { if (negative) { number *= -1; } return (struct apfl_token) { .type = APFL_TOK_NUMBER, .position = position, .number = (apfl_number)number, }; } static enum apfl_parse_result non_decimal_number( apfl_tokenizer_ptr tokenizer, bool need, struct apfl_position position, bool negative, int shift, int (*byte_to_digit)(char)) { struct apfl_position last_pos; bool no_digits_yet = true; char byte; uint64_t num = 0; for (;;) { last_pos = tokenizer->position; switch (read_byte(tokenizer, &byte, no_digits_yet || need)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; if (no_digits_yet) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_EOF, }; return APFL_PARSE_ERROR; } else { tokenizer->token = build_number_token((double)num, position, negative); return APFL_PARSE_OK; } } int digit = byte_to_digit(byte); if (digit >= 0) { num <<= shift; num |= digit; no_digits_yet = false; continue; } if (no_digits_yet) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_EXPECTED_DIGIT, .position = tokenizer->position, }; return APFL_PARSE_ERROR; } if (is_word_byte(byte)) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER, .position = tokenizer->position, .byte = byte, }; return APFL_PARSE_ERROR; } unread_byte(tokenizer, last_pos); tokenizer->token = build_number_token((double)num, position, negative); return APFL_PARSE_OK; } } #define BUILD_NON_DECIMAL_TOKENIZER(name, shift, byte_to_digit) \ static enum apfl_parse_result \ name( \ apfl_tokenizer_ptr tokenizer, \ bool need, \ struct apfl_position position, \ bool negative \ ) { \ return non_decimal_number( \ tokenizer, \ need, \ position, \ negative, \ shift, \ byte_to_digit \ ); \ } BUILD_NON_DECIMAL_TOKENIZER(hex_number, 4, unhex) BUILD_NON_DECIMAL_TOKENIZER(oct_number, 3, unoct) BUILD_NON_DECIMAL_TOKENIZER(bin_number, 1, unbin) static enum apfl_parse_result number( apfl_tokenizer_ptr tokenizer, bool need, struct apfl_position position, char first_digit, bool negative ) { double num = (double)undec(first_digit); double divider = 1; bool first_iteration = true; bool seen_dot = false; struct apfl_position last_pos; for (;; first_iteration = false) { char byte; last_pos = tokenizer->position; switch (read_byte(tokenizer, &byte, need)) { case RR_OK: break; case RR_ERR: return APFL_PARSE_ERROR; case RR_EOF: tokenizer->next_mode = NM_EOF; tokenizer->token = build_number_token(num / divider, position, negative); return APFL_PARSE_OK; } if (first_iteration && first_digit == '0') { switch (byte) { case 'x': case 'X': return hex_number(tokenizer, need, position, negative); case 'b': case 'B': return bin_number(tokenizer, need, position, negative); case 'o': case 'O': return oct_number(tokenizer, need, position, negative); } } int digit = undec(byte); if (digit >= 0) { num *= 10; num += (double)digit; if (seen_dot) { divider *= 10; } continue; } if (byte == '.') { if (seen_dot) { unread_byte(tokenizer, last_pos); tokenizer->token = build_number_token(num / divider, position, negative); return APFL_PARSE_OK; } else { seen_dot = true; continue; } } if (is_word_byte(byte)) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER, .position = tokenizer->position, .byte = byte, }; return APFL_PARSE_ERROR; } unread_byte(tokenizer, last_pos); tokenizer->token = build_number_token(num / divider, position, negative); return APFL_PARSE_OK; } } static enum apfl_parse_result token_source_wrap_next(void *opaque, bool need) { return apfl_tokenizer_next(opaque, need); } static struct apfl_token token_source_wrap_get_token(void *opaque) { return apfl_tokenizer_get_token(opaque); } static struct apfl_error token_source_wrap_get_error(void *opaque) { return apfl_tokenizer_get_error(opaque); } struct apfl_parser_token_source apfl_tokenizer_as_token_source(apfl_tokenizer_ptr p) { return (struct apfl_parser_token_source) { .next = token_source_wrap_next, .get_token = token_source_wrap_get_token, .get_error = token_source_wrap_get_error, .opaque = p, }; }