Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion sqlparse/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,11 @@ def get_tokens(self, text, encoding=None):
try:
text = text.decode('utf-8')
except UnicodeDecodeError:
text = text.decode('unicode-escape')
# Fall back to latin-1 rather than unicode-escape: the
# latter evaluates backslash escapes (\n, \x41, \',
# octal) in the input, so the parsed token stream would
# no longer match the raw bytes the database receives.
text = text.decode('latin-1')
else:
raise TypeError(f"Expected text or file-like object, got {type(text)!r}")

Expand Down
11 changes: 11 additions & 0 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,17 @@ def test_non_ascii():
assert statement._pprint_tree() is None


def test_non_utf8_bytes_preserve_escapes():
    # The trailing 0xFF byte makes this input invalid UTF-8, so parsing has
    # to go through a fallback decode. That fallback must map bytes to
    # characters without interpreting backslash sequences: a unicode-escape
    # decode would rewrite the literal backslash-x-4-1 as "A" and the
    # literal backslash-n as a newline, leaving the parsed statement out of
    # sync with the raw bytes the database receives.
    raw_sql = b"SELECT '\\x41', '\\n' \xff"
    expected_text = "SELECT '\\x41', '\\n' \xff"

    parsed = sqlparse.parse(raw_sql)

    assert len(parsed) == 1
    assert str(parsed[0]) == expected_text


def test_get_real_name():
# issue 369
s = "update a t set t.b=1"
Expand Down
Loading