From 6439f4f8ce5370e3d57147f5cb037a1e254fee10 Mon Sep 17 00:00:00 2001 From: Laria Carolin Chabowski Date: Fri, 7 Jan 2022 23:39:06 +0100 Subject: [PATCH] Tokenizer: Disallow ASCII control characters outside strings --- src/apfl.h | 1 + src/error.c | 5 +++++ src/tokenizer.c | 25 +++++++++++++++++++++++-- src/tokenizer_test.c | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/src/apfl.h b/src/apfl.h index 9cb449d..a381b52 100644 --- a/src/apfl.h +++ b/src/apfl.h @@ -142,6 +142,7 @@ enum apfl_error_type { APFL_ERR_INPUT_ERROR, APFL_ERR_UNEXPECTED_EOF, APFL_ERR_EXPECTED_EQ_AFTER_COLON, + APFL_ERR_UNEXPECTED_BYTE, APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER, APFL_ERR_EXPECTED_DIGIT, APFL_ERR_EXPECTED_HEX_IN_HEX_ESCAPE, diff --git a/src/error.c b/src/error.c index 0c92c37..4f5e9e7 100644 --- a/src/error.c +++ b/src/error.c @@ -18,6 +18,8 @@ apfl_error_type_name(enum apfl_error_type type) return "APFL_ERR_UNEXPECTED_EOF"; case APFL_ERR_EXPECTED_EQ_AFTER_COLON: return "APFL_ERR_EXPECTED_EQ_AFTER_COLON"; + case APFL_ERR_UNEXPECTED_BYTE: + return "APFL_ERR_UNEXPECTED_BYTE"; case APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER: return "APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER"; case APFL_ERR_EXPECTED_DIGIT: @@ -67,6 +69,9 @@ apfl_error_print(struct apfl_error error, FILE *file) case APFL_ERR_EXPECTED_EQ_AFTER_COLON: fprintf(file, "Expected '=' after ':' at " POSFMT "\n", POSARGS); return; + case APFL_ERR_UNEXPECTED_BYTE: + fprintf(file, "Unexpected byte '%c' (0x%X) at " POSFMT "\n", error.byte, (unsigned)error.byte, POSARGS); + return; case APFL_ERR_UNEXPECTED_BYTE_IN_NUMBER: fprintf(file, "Unexpected byte '%c' while parsing number at " POSFMT "\n", error.byte, POSARGS); return; diff --git a/src/tokenizer.c b/src/tokenizer.c index 707fd1d..4e503db 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -158,6 +158,12 @@ static enum apfl_parse_result string(apfl_tokenizer_ptr); static enum apfl_parse_result maybe_name(apfl_tokenizer_ptr, bool, char); static enum apfl_parse_result number(apfl_tokenizer_ptr, bool, struct apfl_position, char, bool); +static bool +is_control_byte(unsigned char byte) +{ + return byte < 0x20 || byte == 0x7F; +} + enum apfl_parse_result apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) { @@ -233,10 +239,19 @@ apfl_tokenizer_next(apfl_tokenizer_ptr tokenizer, bool need) // Skip whitespace break; default: - if (isdigit(byte)) + if (is_control_byte(byte)) { + // Disallow ASCII control characters here + tokenizer->error = (struct apfl_error) { + .type = APFL_ERR_UNEXPECTED_BYTE, + .position = tokenizer->position, + .byte = byte, + }; + return APFL_PARSE_ERROR; + } else if (isdigit(byte)) { return number(tokenizer, need, tokenizer->position, byte, false); - else + } else { return maybe_name(tokenizer, need, byte); + } } } } @@ -689,6 +704,12 @@ maybe_name_inner( break; default: + if (is_control_byte(byte)) { + // Disallow ASCII control characters in names + unread_byte(tokenizer, last_pos); + return finalize_maybe_name(tokenizer, text, pos); + } + if (isdigit(byte) && last_byte == '-') { text->len--; // This removes the '-' from the end of text diff --git a/src/tokenizer_test.c b/src/tokenizer_test.c index e6510f5..0dab30f 100644 --- a/src/tokenizer_test.c +++ b/src/tokenizer_test.c @@ -147,6 +147,30 @@ expect_number_token(struct tokenizer_test *tt, int line, int col, enum apfl_toke } } +static void +expect_error(struct tokenizer_test *tt, enum apfl_error_type want) +{ + struct apfl_token tok; + + switch (apfl_tokenizer_next(tt->tokenizer, false)) { + case APFL_PARSE_OK: + tok = apfl_tokenizer_get_token(tt->tokenizer); + test_failf(tt->t, "Expected error, got token of type %s instead", apfl_token_type_name(tok.type)); + apfl_token_deinit(&tok); + return; + case APFL_PARSE_EOF: + test_fatalf(tt->t, "Got an EOF instead of a token"); + break; + case APFL_PARSE_ERROR: + break; + } + + struct apfl_error have = apfl_tokenizer_get_error(tt->tokenizer); + if (have.type != want) { + test_failf(tt->t, "Expected error of type %s, got %s instead", apfl_error_type_name(want), apfl_error_type_name(have.type)); + } +} + TEST(empty, t) { struct tokenizer_test *tt = new_tokenizer_test(t, ""); @@ -283,6 +307,18 @@ TEST(strings_with_binary_data, t) { destroy_tokenizer_test(tt); } +TEST(err_invalid_bytes, t) { + struct tokenizer_test *tt = new_tokenizer_test(t, "\x05" "foo\x01_bar\x7F" "baz"); + expect_error(tt, APFL_ERR_UNEXPECTED_BYTE); + expect_text_token(tt, 1, 2, APFL_TOK_NAME, "foo"); + expect_error(tt, APFL_ERR_UNEXPECTED_BYTE); + expect_text_token(tt, 1, 6, APFL_TOK_NAME, "_bar"); + expect_error(tt, APFL_ERR_UNEXPECTED_BYTE); + expect_text_token(tt, 1, 11, APFL_TOK_NAME, "baz"); + expect_eof(tt); + destroy_tokenizer_test(tt); +} + TESTS_BEGIN ADDTEST(empty), ADDTEST(simple_variable), @@ -291,4 +327,5 @@ TESTS_BEGIN ADDTEST(assignment), ADDTEST(all_tokens), ADDTEST(strings_with_binary_data), + ADDTEST(err_invalid_bytes), TESTS_END