diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 966cdb2c..c7fe50cc 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -128,7 +128,11 @@ def get_tokens(self, text, encoding=None): try: text = text.decode('utf-8') except UnicodeDecodeError: - text = text.decode('unicode-escape') + # Fall back to latin-1 rather than unicode-escape: the + # latter evaluates backslash escapes (\n, \x41, \', + # octal) in the input, so the parsed token stream would + # no longer match the raw bytes the database receives. + text = text.decode('latin-1') else: raise TypeError(f"Expected text or file-like object, got {type(text)!r}") diff --git a/tests/test_parse.py b/tests/test_parse.py index 34800cb7..3169dfd2 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -457,6 +457,17 @@ def test_non_ascii(): assert statement._pprint_tree() is None +def test_non_utf8_bytes_preserve_escapes(): + # Non-UTF-8 bytes fall back to a single-byte decode and must not have + # backslash escape sequences evaluated (unicode-escape would turn + # "\\x41" into "A" and "\\n" into a newline), which desyncs the parsed + # token stream from the raw bytes the database receives. + s = b"SELECT '\\x41', '\\n' \xff" + stmts = sqlparse.parse(s) + assert len(stmts) == 1 + assert str(stmts[0]) == "SELECT '\\x41', '\\n' \xff" + + def test_get_real_name(): # issue 369 s = "update a t set t.b=1"