@@ -78,22 +78,13 @@ def to_bytes(self):
78
78
# subnode_count
79
79
subnode_count = len (self )
80
80
ret += subnode_count .to_bytes (4 , byteorder = 'little' , signed = False )
81
- # val_len
82
- val_len = len (self .val )
81
+
82
+ # Encode the value as UTF-8
83
+ val_bytes = self .val .encode ('utf-8' )
84
+ # val_len (now stores the byte length of the UTF-8 encoded string)
85
+ val_len = len (val_bytes )
83
86
ret += val_len .to_bytes (4 , byteorder = 'little' , signed = False )
84
87
# val
85
- # Latin-1 is an 8-bit character set. The first 128 characters of its
86
- # set are identical to the US ASCII standard. By encoding the string as
87
- # Latin-1, we can handle all hex characters from \u0000 to \u00ff
88
- # Refs:
89
- # - https://stackoverflow.com/questions/66601743/python3-str-to-bytes-convertation-problem
90
- # - https://kb.iu.edu/d/aepu
91
- val_bytes = bytes (self .val , 'latin-1' )
92
- if val_len != len (val_bytes ):
93
- print (f'The length of `val` should be { val_len } , but found { len (val_bytes )} .' )
94
- print (f'`val` bytes in UTF-8 encoding: { val_bytes } ' )
95
- print ('Please check your grammar file!' )
96
- sys .exit (1 )
97
88
ret += val_bytes
98
89
99
90
# subnodes
@@ -103,6 +94,7 @@ def to_bytes(self):
103
94
return ret
104
95
105
96
@staticmethod
97
+
106
98
def from_bytes (data : bytes ):
107
99
node = TreeNode ()
108
100
consumed = 0
@@ -133,6 +125,7 @@ def from_bytes(data: bytes):
133
125
134
126
return node , consumed
135
127
128
+
136
129
def __str__ (self ):
137
130
ret = ''
138
131
if len (self ) == 0 :
0 commit comments