Tokenizer: Disallow ASCII control characters outside strings

This commit is contained in:
Laria 2022-01-07 23:39:06 +01:00
parent 4eea93ff97
commit 6439f4f8ce
4 changed files with 66 additions and 2 deletions

View file

@ -142,6 +142,7 @@ enum apfl_error_type {
APFL_ERR_INPUT_ERROR,
APFL_ERR_UNEXPECTED_EOF,
APFL_ERR_EXPECTED_EQ_AFTER_COLON,
APFL_ERR_UNEXPECTED_BYTE,
APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER,
APFL_ERR_EXPECTED_DIGIT,
APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE,

View file

@ -18,6 +18,8 @@ apfl_error_type_name(enum apfl_error_type type)
return "APFL_ERR_UNEXPECTED_EOF";
case APFL_ERR_EXPECTED_EQ_AFTER_COLON:
return "APFL_ERR_EXPECTED_EQ_AFTER_COLON";
case APFL_ERR_UNEXPECTED_BYTE:
return "APFL_ERR_UNEXPECTED_BYTE";
case APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER:
return "APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER";
case APFL_ERR_EXPECTED_DIGIT:
@ -67,6 +69,9 @@ apfl_error_print(struct apfl_error error, FILE *file)
case APFL_ERR_EXPECTED_EQ_AFTER_COLON:
fprintf(file, "Expected '=' after ':' at " POSFMT "\n", POSARGS);
return;
case APFL_ERR_UNEXPECTED_BYTE:
fprintf(file, "Unexpected byte '%c' (0x%X) at " POSFMT "\n", error.byte, (unsigned)error.byte, POSARGS);
return;
case APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER:
fprintf(file, "Unexpected byte '%c' while parsing number at " POSFMT "\n", error.byte, POSARGS);
return;

View file

@ -158,6 +158,12 @@ static enum apfl_parse_result string(apfl_tokenizer_ptr);
static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, char);
static enum apfl_parse_result number(apfl_tokenizer_ptr, bool, struct apfl_position, char, bool);
// Reports whether `byte` is an ASCII control character: anything in the
// range 0x00-0x1F, or DEL (0x7F). Bytes >= 0x80 are never reported as
// control bytes.
static bool
is_control_byte(unsigned char byte)
{
    if (byte == 0x7F) {
        // DEL sits outside the contiguous low range, so check it separately
        return true;
    }

    return byte <= 0x1F;
}
enum apfl_parse_result
apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
{
@ -233,10 +239,19 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need)
// Skip whitespace
break;
default:
if (isdigit(byte))
if (is_control_byte(byte)) {
// Disallow ASCII control characters here
tokenizer->error = (struct apfl_error) {
.type = APFL_ERR_UNEXPECTED_BYTE,
.position = tokenizer->position,
.byte = byte,
};
return APFL_PARSE_ERROR;
} else if (isdigit(byte)) {
return number(tokenizer, need, tokenizer->position, byte, false);
else
} else {
return maybe_name(tokenizer, need, byte);
}
}
}
}
@ -689,6 +704,12 @@ maybe_name_inner(
break;
default:
if (is_control_byte(byte)) {
// Disallow ASCII control characters in names
unread_byte(tokenizer, last_pos);
return finalize_maybe_name(tokenizer, text, pos);
}
if (isdigit(byte) && last_byte == '-') {
text->len--; // This removes the '-' from the end of text

View file

@ -147,6 +147,30 @@ expect_number_token(struct tokenizer_test *tt, int line, int col, enum apfl_toke
}
}
static void
expect_error(struct tokenizer_test *tt, enum apfl_error_type want)
{
struct apfl_token tok;
switch (apfl_tokenizer_next(tt->tokenizer, false)) {
case APFL_PARSE_OK:
tok = apfl_tokenizer_get_token(tt->tokenizer);
test_failf(tt->t, "Expected error, got token of type %s instead", apfl_token_type_name(tok.type));
apfl_token_deinit(&tok);
return;
case APFL_PARSE_EOF:
test_fatalf(tt->t, "Got an EOF instead of a token");
break;
case APFL_PARSE_ERROR:
break;
}
struct apfl_error have = apfl_tokenizer_get_error(tt->tokenizer);
if (have.type != want) {
test_failf(tt->t, "Expected error of type %s, got %s instead", apfl_error_type_name(want), apfl_error_type_name(have.type));
}
}
TEST(empty, t) {
struct tokenizer_test *tt = new_tokenizer_test(t, "");
@ -283,6 +307,18 @@ TEST(strings_with_binary_data, t) {
destroy_tokenizer_test(tt);
}
// Checks that ASCII control bytes outside of strings are reported as
// APFL_ERR_UNEXPECTED_BYTE and that tokenizing continues with the next
// token after each such error.
TEST(err_invalid_bytes, t) {
// Input layout on line 1 (columns): 1 = 0x05, 2-4 = "foo", 5 = 0x01,
// 6-9 = "_bar", 10 = 0x7F, 11-13 = "baz". The string literal is split so
// that the hex escapes do not swallow the following hex-digit letters.
struct tokenizer_test *tt = new_tokenizer_test(t, "\x05" "foo\x01_bar\x7F" "baz");
// Leading control byte 0x05
expect_error(tt, APFL_ERR_UNEXPECTED_BYTE);
expect_text_token(tt, 1, 2, APFL_TOK_NAME, "foo");
// Control byte 0x01 ends the name "foo" and is then reported on its own
expect_error(tt, APFL_ERR_UNEXPECTED_BYTE);
expect_text_token(tt, 1, 6, APFL_TOK_NAME, "_bar");
// DEL (0x7F) is rejected as well
expect_error(tt, APFL_ERR_UNEXPECTED_BYTE);
expect_text_token(tt, 1, 11, APFL_TOK_NAME, "baz");
expect_eof(tt);
destroy_tokenizer_test(tt);
}
TESTS_BEGIN
ADDTEST(empty),
ADDTEST(simple_variable),
@ -291,4 +327,5 @@ TESTS_BEGIN
ADDTEST(assignment),
ADDTEST(all_tokens),
ADDTEST(strings_with_binary_data),
ADDTEST(err_invalid_bytes),
TESTS_END