diff --git a/src/numparse.c b/src/numparse.c index ff29690..f55b1f4 100644 --- a/src/numparse.c +++ b/src/numparse.c @@ -4,8 +4,8 @@ #include "parsing.h" -static int -byte_to_digit(unsigned char b) +int +apfl_parse_digit(unsigned char b) { switch (b) { case '0': return 0; @@ -84,7 +84,7 @@ apfl_parse_number( continue; } - int digit = byte_to_digit(b); + int digit = apfl_parse_digit(b); if (digit < 0 || (unsigned)digit >= base) { unread_last(opaque); goto finalize; diff --git a/src/parsing.h b/src/parsing.h index 91fbf27..3702b3b 100644 --- a/src/parsing.h +++ b/src/parsing.h @@ -15,6 +15,8 @@ enum read_result { RR_EOF, }; +int apfl_parse_digit(unsigned char b); + bool apfl_parse_number( unsigned base, enum read_result (*read)(void *, unsigned char *), diff --git a/src/tokenizer.c b/src/tokenizer.c index f047072..77a689c 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -14,6 +14,7 @@ #define BUFSIZE 4096 typedef int buf_offset; static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset"); +static_assert(BUFSIZE >= 2, "BUFSIZE must be at least 2"); struct apfl_tokenizer { struct apfl_allocator allocator; @@ -24,17 +25,16 @@ struct apfl_tokenizer { enum { NM_REGULAR, - NM_NEGATIVE_NUMBER, NM_MAPSTO, NM_ASSIGN, NM_EOF, } next_mode; struct apfl_position pos_for_mapsto; - unsigned char first_digit_for_negative_number; struct apfl_position position; struct apfl_position last_position; bool last_byte_was_linebreak; + bool prev_last_byte_was_linebreak; union { struct apfl_token token; @@ -66,6 +66,7 @@ apfl_tokenizer_new(struct apfl_allocator allocator, struct apfl_source_reader so .col = 0, // The first character was not yet read }; tokenizer->last_byte_was_linebreak = false; + tokenizer->prev_last_byte_was_linebreak = false; tokenizer->next_mode = NM_REGULAR; @@ -99,23 +100,30 @@ static enum read_result read_byte(apfl_tokenizer_ptr tokenizer, unsigned char *byte, bool need) { if (tokenizer->buf_pos >= tokenizer->buf_len) { - size_t len = BUFSIZE; + size_t off = 0; + if (tokenizer->buf_len > 0) { + off = 1; + tokenizer->buf[0] = tokenizer->buf[tokenizer->buf_len - 1]; + } - tokenizer->buf_pos = 0; - tokenizer->buf_len = 0; + size_t len = BUFSIZE - off; - if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf, &len, need)) { + tokenizer->buf_pos = off; + tokenizer->buf_len = off; + + if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf+off, &len, need)) { tokenizer->error.type = APFL_ERR_INPUT_ERROR; return RR_ERR; } - tokenizer->buf_len = len; + tokenizer->buf_len = len + off; if (len == 0) { return RR_EOF; } } + tokenizer->prev_last_byte_was_linebreak = tokenizer->last_byte_was_linebreak; tokenizer->last_position = tokenizer->position; if (tokenizer->last_byte_was_linebreak) { @@ -137,8 +145,10 @@ static void unread_byte(apfl_tokenizer_ptr tokenizer) { tokenizer->position = tokenizer->last_position; + tokenizer->last_byte_was_linebreak = tokenizer->prev_last_byte_was_linebreak; + + assert(tokenizer->buf_pos > 0); tokenizer->buf_pos--; - tokenizer->last_byte_was_linebreak = false; } static enum apfl_parse_result @@ -156,7 +166,8 @@ static enum apfl_parse_result comment(apfl_tokenizer_ptr); static enum apfl_parse_result colon(apfl_tokenizer_ptr); static enum apfl_parse_result string(apfl_tokenizer_ptr); static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, unsigned char); -static enum apfl_parse_result number(apfl_tokenizer_ptr, bool, struct apfl_position, unsigned char, bool); +static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool); +static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool); static bool is_control_byte(unsigned char byte) @@ -164,6 +175,47 @@ is_control_byte(unsigned char byte) return byte < 0x20 || byte == 0x7F; } +static enum apfl_parse_result +minus(apfl_tokenizer_ptr tokenizer) +{ + struct apfl_position pos = tokenizer->position; + + unsigned char byte; + switch (read_byte(tokenizer, &byte, true)) { + case RR_OK: + break; + case RR_ERR: + return APFL_PARSE_ERROR; + case RR_EOF: + tokenizer->next_mode = NM_EOF; + struct apfl_string str = apfl_string_blank(); + if (!apfl_string_copy(tokenizer->allocator, &str, apfl_string_view_from("-"))) { + tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED); + return APFL_PARSE_ERROR; + } + tokenizer->token = (struct apfl_token) { + .type = APFL_TOK_NAME, + .position = pos, + .text = str, + }; + return APFL_PARSE_OK; + } + + switch (byte) { + case '0': + return zero(tokenizer, pos, true); + case '>': + return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, pos); + default: + unread_byte(tokenizer); + if (isdigit(byte)) { + return number(tokenizer, 10, pos, true); + } else { + return maybe_name(tokenizer, true, '-'); + } + } +} + enum apfl_parse_result apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) { @@ -173,9 +225,6 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) case NM_MAPSTO: tokenizer->next_mode = NM_REGULAR; return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto); - case NM_NEGATIVE_NUMBER: - tokenizer->next_mode = NM_REGULAR; - return number(tokenizer, need, tokenizer->position, tokenizer->first_digit_for_negative_number, true); case NM_ASSIGN: tokenizer->next_mode = NM_REGULAR; return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position); @@ -233,11 +282,15 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) return colon(tokenizer); case '"': return string(tokenizer); + case '-': + return minus(tokenizer); case ' ': case '\r': case '\t': // Skip whitespace break; + case '0': + return zero(tokenizer, tokenizer->position, false); default: if (is_control_byte(byte)) { // Disallow ASCII control characters here @@ -248,7 +301,9 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) }; return APFL_PARSE_ERROR; } else if (isdigit(byte)) { - return number(tokenizer, need, tokenizer->position, byte, false); + struct apfl_position position = tokenizer->position; + unread_byte(tokenizer); + return number(tokenizer, 10, position, false); } else { return maybe_name(tokenizer, need, byte); } @@ -339,120 +394,6 @@ append_single_byte( return APFL_PARSE_OK; } -static int -unhex(unsigned char byte) -{ - switch (byte) { - case '0': - return 0x0; - case '1': - return 0x1; - case '2': - return 0x2; - case '3': - return 0x3; - case '4': - return 0x4; - case '5': - return 0x5; - case '6': - return 0x6; - case '7': - return 0x7; - case '8': - return 0x8; - case '9': - return 0x9; - case 'a': - case 'A': - return 0xA; - case 'b': - case 'B': - return 0xB; - case 'c': - case 'C': - return 0xC; - case 'd': - case 'D': - return 0xD; - case 'e': - case 'E': - return 0xE; - case 'f': - case 'F': - return 0xF; - } - - return -1; -} - -static int -undec(unsigned char byte) -{ - switch (byte) { - case '0': - return 0; - case '1': - return 1; - case '2': - return 2; - case '3': - return 3; - case '4': - return 4; - case '5': - return 5; - case '6': - return 6; - case '7': - return 7; - case '8': - return 8; - case '9': - return 9; - } - - return -1; -} - -static int -unoct(unsigned char byte) -{ - switch (byte) { - case '0': - return 0; - case '1': - return 1; - case '2': - return 2; - case '3': - return 3; - case '4': - return 4; - case '5': - return 5; - case '6': - return 6; - case '7': - return 7; - } - - return -1; -} - -static int -unbin(unsigned char byte) -{ - switch (byte) { - case '0': - return 0; - case '1': - return 1; - } - - return -1; -} - static enum apfl_parse_result hex_escape( apfl_tokenizer_ptr tokenizer, @@ -474,8 +415,8 @@ hex_escape( return APFL_PARSE_ERROR; } - int nibble = unhex(byte); - if (nibble < 0) { + int nibble = apfl_parse_digit(byte); + if (nibble < 0 || nibble > 0xF) { tokenizer->error = (struct apfl_error) { .type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE, .position = tokenizer->position, @@ -703,18 +644,6 @@ maybe_name_inner( return finalize_maybe_name(tokenizer, text, pos); } - if (isdigit(byte) && last_byte == '-') { - text->len--; // This removes the '-' from the end of text - - if (text->len == 0) { - return number(tokenizer, need, pos, byte, true); - } - - tokenizer->next_mode = NM_NEGATIVE_NUMBER; - tokenizer->first_digit_for_negative_number = byte; - return finalize_maybe_name(tokenizer, text, pos); - } - break; } @@ -752,169 +681,88 @@ build_number_token(double number, struct apfl_position position, bool negative) } static enum apfl_parse_result -non_decimal_number( - apfl_tokenizer_ptr tokenizer, - bool need, - struct apfl_position position, - bool negative, - int shift, - int (*byte_to_digit)(unsigned char)) +zero(apfl_tokenizer_ptr tokenizer, struct apfl_position position, bool negative) { - bool no_digits_yet = true; unsigned char byte; - - uint64_t num = 0; - - for (;;) { - switch (read_byte(tokenizer, &byte, no_digits_yet || need)) { - case RR_OK: - break; - case RR_ERR: - return APFL_PARSE_ERROR; - case RR_EOF: - tokenizer->next_mode = NM_EOF; - if (no_digits_yet) { - tokenizer->error = (struct apfl_error) { - .type = APFL_ERR_UNEXPECTED_EOF, - }; - return APFL_PARSE_ERROR; - } else { - tokenizer->token = build_number_token((double)num, position, negative); - return APFL_PARSE_OK; - } - } - - int digit = byte_to_digit(byte); - if (digit >= 0) { - num <<= shift; - num |= digit; - - no_digits_yet = false; - continue; - } - - if (no_digits_yet) { - tokenizer->error = (struct apfl_error) { - .type = APFL_ERR_EXPECTED_DIGIT, - .position = tokenizer->position, - }; - return APFL_PARSE_ERROR; - } - - if (is_word_byte(byte)) { - tokenizer->error = (struct apfl_error) { - .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER, - .position = tokenizer->position, - .byte = byte, - }; - return APFL_PARSE_ERROR; - } - - unread_byte(tokenizer); - tokenizer->token = build_number_token((double)num, position, negative); + switch (read_byte(tokenizer, &byte, true)) { + case RR_OK: + break; + case RR_ERR: + return APFL_PARSE_ERROR; + case RR_EOF: + tokenizer->next_mode = NM_EOF; + tokenizer->token = build_number_token(0, position, negative); return APFL_PARSE_OK; } + + switch (byte) { + case 'x': + case 'X': + return number(tokenizer, 16, position, negative); + case 'o': + case 'O': + return number(tokenizer, 8, position, negative); + case 'b': + case 'B': + return number(tokenizer, 2, position, negative); + default: + unread_byte(tokenizer); + return number(tokenizer, 10, position, negative); + } } -#define BUILD_NON_DECIMAL_TOKENIZER(name, shift, byte_to_digit) \ - static enum apfl_parse_result \ - name( \ - apfl_tokenizer_ptr tokenizer, \ - bool need, \ - struct apfl_position position, \ - bool negative \ - ) { \ - return non_decimal_number( \ - tokenizer, \ - need, \ - position, \ - negative, \ - shift, \ - byte_to_digit \ - ); \ - } +static enum read_result +read_for_parse_number(void *opaque, unsigned char *byte) +{ + apfl_tokenizer_ptr tokenizer = opaque; + return read_byte(tokenizer, byte, true); +} -BUILD_NON_DECIMAL_TOKENIZER(hex_number, 4, unhex) -BUILD_NON_DECIMAL_TOKENIZER(oct_number, 3, unoct) -BUILD_NON_DECIMAL_TOKENIZER(bin_number, 1, unbin) +static void +unread_for_parse_number(void *opaque) +{ + apfl_tokenizer_ptr tokenizer = opaque; + unread_byte(tokenizer); +} static enum apfl_parse_result -number( - apfl_tokenizer_ptr tokenizer, - bool need, - struct apfl_position position, - unsigned char first_digit, - bool negative -) { - double num = (double)undec(first_digit); - double divider = 1; - bool first_iteration = true; - bool seen_dot = false; +number(apfl_tokenizer_ptr tokenizer, unsigned base, struct apfl_position pos, bool negative) +{ + apfl_number num; + if (!apfl_parse_number( + base, + read_for_parse_number, + unread_for_parse_number, + tokenizer, + &num + )) { + return APFL_PARSE_ERROR; + } - for (;; first_iteration = false) { - unsigned char byte; - - switch (read_byte(tokenizer, &byte, need)) { - case RR_OK: - break; - case RR_ERR: - return APFL_PARSE_ERROR; - case RR_EOF: - tokenizer->next_mode = NM_EOF; - tokenizer->token = build_number_token(num / divider, position, negative); - return APFL_PARSE_OK; - } - - if (first_iteration && first_digit == '0') { - switch (byte) { - case 'x': - case 'X': - return hex_number(tokenizer, need, position, negative); - case 'b': - case 'B': - return bin_number(tokenizer, need, position, negative); - case 'o': - case 'O': - return oct_number(tokenizer, need, position, negative); - } - } - - int digit = undec(byte); - if (digit >= 0) { - num *= 10; - num += (double)digit; - - if (seen_dot) { - divider *= 10; - } - - continue; - } - - if (byte == '.') { - if (seen_dot) { - unread_byte(tokenizer); - tokenizer->token = build_number_token(num / divider, position, negative); - return APFL_PARSE_OK; - } else { - seen_dot = true; - continue; - } - } - - if (is_word_byte(byte)) { - tokenizer->error = (struct apfl_error) { - .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER, - .position = tokenizer->position, - .byte = byte, - }; - return APFL_PARSE_ERROR; - } - - unread_byte(tokenizer); - tokenizer->token = build_number_token(num / divider, position, negative); + unsigned char byte; + switch (read_byte(tokenizer, &byte, false)) { + case RR_OK: + break; + case RR_ERR: + return APFL_PARSE_ERROR; + case RR_EOF: + tokenizer->next_mode = NM_EOF; + tokenizer->token = build_number_token(num, pos, negative); return APFL_PARSE_OK; } + + if (is_word_byte(byte)) { + tokenizer->error = (struct apfl_error) { + .type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER, + .position = tokenizer->position, + .byte = byte, + }; + return APFL_PARSE_ERROR; + } + + unread_byte(tokenizer); + tokenizer->token = build_number_token(num, pos, negative); + return APFL_PARSE_OK; } static enum apfl_parse_result