Skip to content

Commit

Permalink
fix unicode handling in python constants
Browse files Browse the repository at this point in the history
Summary: Fix for bug where unicode strings are rendered with octal-escaped characters, which basically makes them incorrectly encoded bytes. This diff is a cleaner version of fix for thrift-python that doesn't require using `bytes.decode`; instead, we keep hex-encoding of non-ascii characters in unicode string literals.

Reviewed By: yoney

Differential Revision: D68454598

fbshipit-source-id: 20fe46c4c7d615d97f9ea487c8a14659c5ee3e4d
  • Loading branch information
ahilger authored and facebook-github-bot committed Jan 22, 2025
1 parent 2e4530a commit a5206e8
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 5 deletions.
14 changes: 13 additions & 1 deletion third-party/thrift/src/thrift/compiler/generate/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@ inline bool generate_legacy_api(const t_service&) {
return true;
}

enum class nonascii_handling { octal_escape, no_escape };

/**
* For languages that don't support unicode with hex-escaped code points,
* default to octal escapes for non-ascii characters. Some
* languages (e.g., Python) prefer hex-escaped unicode code points in unicode
* literals.
*
* In any case, use octal escapes for certain non-printable characters like DEL.
*/
template <nonascii_handling Handling = nonascii_handling::octal_escape>
inline std::string get_escaped_string(std::string_view str) {
std::string escaped;
escaped.reserve(str.size());
Expand All @@ -77,7 +88,8 @@ inline std::string get_escaped_string(std::string_view str) {
escaped.append("\\077");
break;
default:
if (c < 0x20 || c >= 0x7F) {
if ((c < 0x20 || c == 0x7F) ||
(Handling == nonascii_handling::octal_escape && c > 0x7F)) {
// Use octal escape sequences because they are the most portable
// across languages. Hexadecimal ones have a problem of consuming
// all hex digits after \x in C++, e.g. \xcafefe is a single escape
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1133,6 +1133,7 @@ class py3_mstch_const_value : public mstch_const_value {
{"value:value_for_floating_point?",
&py3_mstch_const_value::is_float_value},
{"value:py3_binary?", &py3_mstch_const_value::is_binary},
{"value:unicode_value", &py3_mstch_const_value::unicode_value},
{"value:const_enum_type", &py3_mstch_const_value::const_enum_type},
{"value:py3_enum_value_name",
&py3_mstch_const_value::py3_enum_value_name},
Expand Down Expand Up @@ -1162,6 +1163,20 @@ class py3_mstch_const_value : public mstch_const_value {
ttype->get_true_type()->is_binary();
}

/*
* Use this function (instead of the version used by C++) to render unicode
* strings, i.e., normal python strings "".
* For binary bytes b"", use string_value, which has octal escapes for unicode
* characters.
*/
mstch::node unicode_value() {
if (type_ != cv::CV_STRING) {
return {};
}
return get_escaped_string<nonascii_handling::no_escape>(
const_value_->get_string());
}

mstch::node py3_enum_value_name() {
if (!const_value_->is_enum() || const_value_->get_enum_value() == nullptr) {
return mstch::node();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ file.
}}{{/value:const_struct?}}{{!
}}{{^value:const_struct?}}{{!
}}{{#value:string?}}{{!
}}{{#value:py3_binary?}}b{{/value:py3_binary?}}"{{value:string_value}}"{{!
}}{{#value:py3_binary?}}b"{{value:string_value}}"{{/value:py3_binary?}}{{!
}}{{^value:py3_binary?}}"{{value:unicode_value}}"{{/value:py3_binary?}}{{!
}}{{/value:string?}}{{!
}}{{#value:map?}}{{!
}}{{#value:const_container_type}}{{type:flat_name}}( { {{! open brace
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@
backslash = "\\"
escaped_a = "a"
char2ascii = Map__string_i32( { "'": 39, "\"": 34, "\\": 92, "a": 97 })
escaped_strings = List__string(("\001", "\037", " ", "'", "\"", "\n", "\r", "\011", "a", "\302\253", "j", "\302\246", "ayyy", "\302\253yyy", "jyyy", "\302\246yyy", "zzza", "zzz\302\253", "zzzj", "zzz\302\246", "zzzayyy", "zzz\302\253yyy", "zzzjyyy", "zzz\302\246yyy", ))
unicode_list = List__string(("Bulgaria", "Benin", "Saint Barth\303\251lemy", ))
escaped_strings = List__string(("\001", "\037", " ", "'", "\"", "\n", "\r", "\011", "a", "«", "j", "¦", "ayyy", "«yyy", "jyyy", "¦yyy", "zzza", "zzz«", "zzzj", "zzz¦", "zzzayyy", "zzz«yyy", "zzzjyyy", "zzz¦yyy", ))
unicode_list = List__string(("Bulgaria", "Benin", "Saint Barthélemy", ))
false_c = False
true_c = True
zero_byte = 0
Expand All @@ -86,7 +86,7 @@
empty_int_string_map = Map__i32_string( { })
empty_string_int_map = Map__string_i32( { })
empty_string_string_map = Map__string_string( { })
unicode_map = Map__string_string( { "BG": "Bulgaria", "BH": "Bahrain", "B\303\211": "Saint Barth\303\251lemy" })
unicode_map = Map__string_string( { "BG": "Bulgaria", "BH": "Bahrain", "": "Saint Barthélemy" })
maxIntDec = 9223372036854775807
maxIntOct = 9223372036854775807
maxIntHex = 9223372036854775807
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import unittest

from testing.types import (
binary_list,
Color,
ColorGroups,
easy,
Expand All @@ -31,6 +32,7 @@
StringList,
StrList2D,
Uint32List,
unicode_list,
)
from thrift.lib.py3.test.auto_migrate.auto_migrate_util import (
brokenInAutoMigrate,
Expand Down Expand Up @@ -61,6 +63,10 @@ def test_list_of_None(self) -> None:
# but got `List[None]`.
List__i32([None, None, None])

def test_string_const(self) -> None:
self.assertEqual(unicode_list, ["Bulgaria", "Benin", "Saint Barthélemy"])
self.assertEqual(binary_list, [b"Saint Barth\xc3\xa9lemy"])

def test_isinstance(self) -> None:
base_cls = PythonList if is_auto_migrated() else Py3List
self.assertIsInstance(List__i32(range(5)), base_cls)
Expand Down
2 changes: 2 additions & 0 deletions third-party/thrift/src/thrift/lib/py3/test/testing.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ namespace py3 ""
namespace cpp2 cpp2

const list<i16> int_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
const list<string> unicode_list = ["Bulgaria", "Benin", "Saint Barthélemy"];
const list<binary> binary_list = ["Saint Barthélemy"];

const map<i16, map<i16, i16>> LocationMap = {1: {1: 1}};

Expand Down

0 comments on commit a5206e8

Please sign in to comment.