tokenizer: Use same number parsing code as in tonumber

This commit is contained in:
Laria 2023-09-04 14:04:40 +02:00
parent 0209fe6a51
commit 387b4bfd99
3 changed files with 145 additions and 295 deletions

View file

@ -4,8 +4,8 @@
#include "parsing.h"
static int
byte_to_digit(unsigned char b)
int
apfl_parse_digit(unsigned char b)
{
switch (b) {
case '0': return 0;
@ -84,7 +84,7 @@ apfl_parse_number(
continue;
}
int digit = byte_to_digit(b);
int digit = apfl_parse_digit(b);
if (digit < 0 || (unsigned)digit >= base) {
unread_last(opaque);
goto finalize;

View file

@ -15,6 +15,8 @@ enum read_result {
RR_EOF,
};
int apfl_parse_digit(unsigned char b);
bool apfl_parse_number(
unsigned base,
enum read_result (*read)(void *, unsigned char *),

View file

@ -14,6 +14,7 @@
#define BUFSIZE 4096
typedef int buf_offset;
static_assert(INT_MAX >= BUFSIZE, "BUFSIZE is too large for type buf_offset");
static_assert(BUFSIZE >= 2, "BUFSIZE must be at least 2");
struct apfl_tokenizer {
struct apfl_allocator allocator;
@ -24,17 +25,16 @@ struct apfl_tokenizer {
enum {
NM_REGULAR,
NM_NEGATIVE_NUMBER,
NM_MAPSTO,
NM_ASSIGN,
NM_EOF,
} next_mode;
struct apfl_position pos_for_mapsto;
unsigned char first_digit_for_negative_number;
struct apfl_position position;
struct apfl_position last_position;
bool last_byte_was_linebreak;
bool prev_last_byte_was_linebreak;
union {
struct apfl_token token;
@ -66,6 +66,7 @@ apfl_tokenizer_new(struct apfl_allocator allocator, struct apfl_source_reader so
.col = 0, // The first character was not yet read
};
tokenizer->last_byte_was_linebreak = false;
tokenizer->prev_last_byte_was_linebreak = false;
tokenizer->next_mode = NM_REGULAR;
@ -99,23 +100,30 @@ static enum read_result
read_byte(apfl_tokenizer_ptr tokenizer, unsigned char *byte, bool need)
{
if (tokenizer->buf_pos >= tokenizer->buf_len) {
size_t len = BUFSIZE;
size_t off = 0;
if (tokenizer->buf_len > 0) {
off = 1;
tokenizer->buf[0] = tokenizer->buf[tokenizer->buf_len - 1];
}
tokenizer->buf_pos = 0;
tokenizer->buf_len = 0;
size_t len = BUFSIZE - off;
if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf, &len, need)) {
tokenizer->buf_pos = off;
tokenizer->buf_len = off;
if (!tokenizer->source_reader.callback(tokenizer->source_reader.opaque, tokenizer->buf+off, &len, need)) {
tokenizer->error.type = APFL_ERR_INPUT_ERROR;
return RR_ERR;
}
tokenizer->buf_len = len;
tokenizer->buf_len = len + off;
if (len == 0) {
return RR_EOF;
}
}
tokenizer->prev_last_byte_was_linebreak = tokenizer->last_byte_was_linebreak;
tokenizer->last_position = tokenizer->position;
if (tokenizer->last_byte_was_linebreak) {
@ -137,8 +145,10 @@ static void
unread_byte(apfl_tokenizer_ptr tokenizer)
{
tokenizer->position = tokenizer->last_position;
tokenizer->last_byte_was_linebreak = tokenizer->prev_last_byte_was_linebreak;
assert(tokenizer->buf_pos > 0);
tokenizer->buf_pos--;
tokenizer->last_byte_was_linebreak = false;
}
static enum apfl_parse_result
@ -156,7 +166,8 @@ static enum apfl_parse_result comment(apfl_tokenizer_ptr);
static enum apfl_parse_result colon(apfl_tokenizer_ptr);
static enum apfl_parse_result string(apfl_tokenizer_ptr);
static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, unsigned char);
static enum apfl_parse_result number(apfl_tokenizer_ptr, bool, struct apfl_position, unsigned char, bool);
static enum apfl_parse_result number(apfl_tokenizer_ptr, unsigned, struct apfl_position, bool);
static enum apfl_parse_result zero(apfl_tokenizer_ptr, struct apfl_position, bool);
static bool
is_control_byte(unsigned char byte)
@ -164,6 +175,47 @@ is_control_byte(unsigned char byte)
return byte < 0x20 || byte == 0x7F;
}
// Tokenizes input that begins with a '-' byte. The tokenizer's main dispatch
// calls this after it has already read the '-', so tokenizer->position is
// captured up front as the position of the resulting token.
//
// The byte following the '-' decides the outcome:
//   - end of input: next_mode becomes NM_EOF and a NAME token with text "-"
//     is produced
//   - '0':          delegates to zero() with the negative flag set
//   - '>':          yields a MAPSTO token
//   - other digit:  the byte is unread and number() parses base 10, negative
//   - anything else: the byte is unread and maybe_name() continues from '-'
//
// Returns APFL_PARSE_ERROR if reading fails or the "-" string cannot be
// allocated; otherwise whatever the selected branch returns.
static enum apfl_parse_result
minus(apfl_tokenizer_ptr tokenizer)
{
    struct apfl_position start = tokenizer->position;

    unsigned char next;
    enum read_result rr = read_byte(tokenizer, &next, true);

    if (rr == RR_ERR) {
        return APFL_PARSE_ERROR;
    }

    if (rr == RR_EOF) {
        // A lone '-' at the very end of the input is an ordinary name.
        tokenizer->next_mode = NM_EOF;

        struct apfl_string dash = apfl_string_blank();
        if (!apfl_string_copy(tokenizer->allocator, &dash, apfl_string_view_from("-"))) {
            tokenizer->error = apfl_error_simple(APFL_ERR_MALLOC_FAILED);
            return APFL_PARSE_ERROR;
        }

        tokenizer->token = (struct apfl_token) {
            .type = APFL_TOK_NAME,
            .position = start,
            .text = dash,
        };
        return APFL_PARSE_OK;
    }

    if (next == '0') {
        return zero(tokenizer, start, true);
    }

    if (next == '>') {
        return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, start);
    }

    // Neither "-0" nor "->": hand the byte back so the callee reads it itself.
    unread_byte(tokenizer);

    if (!isdigit(next)) {
        return maybe_name(tokenizer, true, '-');
    }

    return number(tokenizer, 10, start, true);
}
enum apfl_parse_result
apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
{
@ -173,9 +225,6 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
case NM_MAPSTO:
tokenizer->next_mode = NM_REGULAR;
return yield_simple_token(tokenizer, APFL_TOK_MAPSTO, tokenizer->pos_for_mapsto);
case NM_NEGATIVE_NUMBER:
tokenizer->next_mode = NM_REGULAR;
return number(tokenizer, need, tokenizer->position, tokenizer->first_digit_for_negative_number, true);
case NM_ASSIGN:
tokenizer->next_mode = NM_REGULAR;
return yield_simple_token(tokenizer, APFL_TOK_ASSIGN, tokenizer->position);
@ -233,11 +282,15 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
return colon(tokenizer);
case '"':
return string(tokenizer);
case '-':
return minus(tokenizer);
case ' ':
case '\r':
case '\t':
// Skip whitespace
break;
case '0':
return zero(tokenizer, tokenizer->position, false);
default:
if (is_control_byte(byte)) {
// Disallow ASCII control characters here
@ -248,7 +301,9 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
};
return APFL_PARSE_ERROR;
} else if (isdigit(byte)) {
return number(tokenizer, need, tokenizer->position, byte, false);
struct apfl_position position = tokenizer->position;
unread_byte(tokenizer);
return number(tokenizer, 10, position, false);
} else {
return maybe_name(tokenizer, need, byte);
}
@ -339,120 +394,6 @@ append_single_byte(
return APFL_PARSE_OK;
}
// Converts one hexadecimal digit character to its value.
//
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'. Returns the digit value (0 to 15),
// or -1 if `byte` is not a hexadecimal digit.
static int
unhex(unsigned char byte)
{
    if (byte >= '0' && byte <= '9') {
        return byte - '0';
    }

    // 'a'..'f' and 'A'..'F' are contiguous in ASCII (and EBCDIC), so the
    // offset from the first letter gives the value above ten.
    if (byte >= 'a' && byte <= 'f') {
        return 0xA + (byte - 'a');
    }

    if (byte >= 'A' && byte <= 'F') {
        return 0xA + (byte - 'A');
    }

    return -1;
}
// Converts one decimal digit character to its value.
//
// Returns 0 to 9 for '0'-'9', or -1 for any other byte.
static int
undec(unsigned char byte)
{
    // C guarantees that '0'..'9' have consecutive values.
    bool is_decimal = byte >= '0' && byte <= '9';
    return is_decimal ? byte - '0' : -1;
}
// Converts one octal digit character to its value.
//
// Returns 0 to 7 for '0'-'7', or -1 for any other byte (including '8' and '9').
static int
unoct(unsigned char byte)
{
    if (byte < '0' || byte > '7') {
        return -1;
    }

    // C guarantees that the decimal digit characters are consecutive.
    return byte - '0';
}
// Converts one binary digit character to its value.
//
// Returns 0 for '0', 1 for '1', and -1 for any other byte.
static int
unbin(unsigned char byte)
{
    if (byte == '0') {
        return 0;
    }

    if (byte == '1') {
        return 1;
    }

    return -1;
}
static enum apfl_parse_result
hex_escape(
apfl_tokenizer_ptr tokenizer,
@ -474,8 +415,8 @@ hex_escape(
return APFL_PARSE_ERROR;
}
int nibble = unhex(byte);
if (nibble < 0) {
int nibble = apfl_parse_digit(byte);
if (nibble < 0 || nibble > 0xF) {
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE,
.position = tokenizer->position,
@ -703,18 +644,6 @@ maybe_name_inner(
return finalize_maybe_name(tokenizer, text, pos);
}
if (isdigit(byte) && last_byte == '-') {
text->len--; // This removes the '-' from the end of text
if (text->len == 0) {
return number(tokenizer, need, pos, byte, true);
}
tokenizer->next_mode = NM_NEGATIVE_NUMBER;
tokenizer->first_digit_for_negative_number = byte;
return finalize_maybe_name(tokenizer, text, pos);
}
break;
}
@ -752,169 +681,88 @@ build_number_token(double number, struct apfl_position position, bool negative)
}
static enum apfl_parse_result
non_decimal_number(
apfl_tokenizer_ptr tokenizer,
bool need,
struct apfl_position position,
bool negative,
int shift,
int (*byte_to_digit)(unsigned char))
zero(apfl_tokenizer_ptr tokenizer, struct apfl_position position, bool negative)
{
bool no_digits_yet = true;
unsigned char byte;
uint64_t num = 0;
for (;;) {
switch (read_byte(tokenizer, &byte, no_digits_yet || need)) {
case RR_OK:
break;
case RR_ERR:
return APFL_PARSE_ERROR;
case RR_EOF:
tokenizer->next_mode = NM_EOF;
if (no_digits_yet) {
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_UNEXPECTED_EOF,
};
return APFL_PARSE_ERROR;
} else {
tokenizer->token = build_number_token((double)num, position, negative);
return APFL_PARSE_OK;
}
}
int digit = byte_to_digit(byte);
if (digit >= 0) {
num <<= shift;
num |= digit;
no_digits_yet = false;
continue;
}
if (no_digits_yet) {
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_EXPECTED_DIGIT,
.position = tokenizer->position,
};
return APFL_PARSE_ERROR;
}
if (is_word_byte(byte)) {
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
.position = tokenizer->position,
.byte = byte,
};
return APFL_PARSE_ERROR;
}
unread_byte(tokenizer);
tokenizer->token = build_number_token((double)num, position, negative);
switch (read_byte(tokenizer, &byte, true)) {
case RR_OK:
break;
case RR_ERR:
return APFL_PARSE_ERROR;
case RR_EOF:
tokenizer->next_mode = NM_EOF;
tokenizer->token = build_number_token(0, position, negative);
return APFL_PARSE_OK;
}
switch (byte) {
case 'x':
case 'X':
return number(tokenizer, 16, position, negative);
case 'o':
case 'O':
return number(tokenizer, 8, position, negative);
case 'b':
case 'B':
return number(tokenizer, 2, position, negative);
default:
unread_byte(tokenizer);
return number(tokenizer, 10, position, negative);
}
}
// Defines a static function called `name` with the parameters
// (tokenizer, need, position, negative) that forwards them unchanged to
// non_decimal_number(), adding the given `shift` and `byte_to_digit` as the
// last two arguments.
//
// `shift` is the amount non_decimal_number() left-shifts its accumulator by
// for each digit; `byte_to_digit` is the function it calls to turn a byte
// into a digit value.
#define BUILD_NON_DECIMAL_TOKENIZER(name, shift, byte_to_digit) \
static enum apfl_parse_result \
name( \
apfl_tokenizer_ptr tokenizer, \
bool need, \
struct apfl_position position, \
bool negative \
) { \
return non_decimal_number( \
tokenizer, \
need, \
position, \
negative, \
shift, \
byte_to_digit \
); \
}
static enum read_result
read_for_parse_number(void *opaque, unsigned char *byte)
{
apfl_tokenizer_ptr tokenizer = opaque;
return read_byte(tokenizer, byte, true);
}
BUILD_NON_DECIMAL_TOKENIZER(hex_number, 4, unhex)
BUILD_NON_DECIMAL_TOKENIZER(oct_number, 3, unoct)
BUILD_NON_DECIMAL_TOKENIZER(bin_number, 1, unbin)
static void
unread_for_parse_number(void *opaque)
{
apfl_tokenizer_ptr tokenizer = opaque;
unread_byte(tokenizer);
}
static enum apfl_parse_result
number(
apfl_tokenizer_ptr tokenizer,
bool need,
struct apfl_position position,
unsigned char first_digit,
bool negative
) {
double num = (double)undec(first_digit);
double divider = 1;
bool first_iteration = true;
bool seen_dot = false;
number(apfl_tokenizer_ptr tokenizer, unsigned base, struct apfl_position pos, bool negative)
{
apfl_number num;
if (!apfl_parse_number(
base,
read_for_parse_number,
unread_for_parse_number,
tokenizer,
&num
)) {
return APFL_PARSE_ERROR;
}
for (;; first_iteration = false) {
unsigned char byte;
switch (read_byte(tokenizer, &byte, need)) {
case RR_OK:
break;
case RR_ERR:
return APFL_PARSE_ERROR;
case RR_EOF:
tokenizer->next_mode = NM_EOF;
tokenizer->token = build_number_token(num / divider, position, negative);
return APFL_PARSE_OK;
}
if (first_iteration && first_digit == '0') {
switch (byte) {
case 'x':
case 'X':
return hex_number(tokenizer, need, position, negative);
case 'b':
case 'B':
return bin_number(tokenizer, need, position, negative);
case 'o':
case 'O':
return oct_number(tokenizer, need, position, negative);
}
}
int digit = undec(byte);
if (digit >= 0) {
num *= 10;
num += (double)digit;
if (seen_dot) {
divider *= 10;
}
continue;
}
if (byte == '.') {
if (seen_dot) {
unread_byte(tokenizer);
tokenizer->token = build_number_token(num / divider, position, negative);
return APFL_PARSE_OK;
} else {
seen_dot = true;
continue;
}
}
if (is_word_byte(byte)) {
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
.position = tokenizer->position,
.byte = byte,
};
return APFL_PARSE_ERROR;
}
unread_byte(tokenizer);
tokenizer->token = build_number_token(num / divider, position, negative);
unsigned char byte;
switch (read_byte(tokenizer, &byte, false)) {
case RR_OK:
break;
case RR_ERR:
return APFL_PARSE_ERROR;
case RR_EOF:
tokenizer->next_mode = NM_EOF;
tokenizer->token = build_number_token(num, pos, negative);
return APFL_PARSE_OK;
}
if (is_word_byte(byte)) {
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
.position = tokenizer->position,
.byte = byte,
};
return APFL_PARSE_ERROR;
}
unread_byte(tokenizer);
tokenizer->token = build_number_token(num, pos, negative);
return APFL_PARSE_OK;
}
static enum apfl_parse_result