diff --git a/Cython/Build/Tests/TestCythonizeArgsParser.py b/Cython/Build/Tests/TestCythonizeArgsParser.py index 105362dbbec..c2769a00df8 100644 --- a/Cython/Build/Tests/TestCythonizeArgsParser.py +++ b/Cython/Build/Tests/TestCythonizeArgsParser.py @@ -105,8 +105,8 @@ def test_directives_types(self): options, args = self.parse_args(['-X', cmd]) self.assertFalse(args) self.assertTrue(self.are_default(options, ['directives']), msg = "Error for option: "+cmd) - if value == 'str': - value = 'unicode' + if value == 'unicode': + value = 'str' self.assertEqual(options.directives[key], value, msg = "Error for option: "+cmd) def test_directives_wrong(self): diff --git a/Cython/CodeWriter.py b/Cython/CodeWriter.py index c7a3f09a0e8..ea54a4ddf09 100644 --- a/Cython/CodeWriter.py +++ b/Cython/CodeWriter.py @@ -510,11 +510,13 @@ def comma_separated_list(self, items): def visit_Node(self, node): raise AssertionError("Node not handled by serializer: %r" % node) + # TODO: Remove redundancy below. Most constants serialise fine as just "repr(node.value)". + def visit_IntNode(self, node): - self.put(node.value) + self.put(repr(node.value)) def visit_FloatNode(self, node): - self.put(node.value) + self.put(repr(node.value)) def visit_NoneNode(self, node): self.put("None") @@ -526,29 +528,19 @@ def visit_EllipsisNode(self, node): self.put("...") def visit_BoolNode(self, node): - self.put(str(node.value)) + self.put(repr(node.value)) def visit_ConstNode(self, node): - self.put(str(node.value)) + self.put(repr(node.value)) def visit_ImagNode(self, node): - self.put(node.value) - self.put("j") - - def emit_string(self, node, prefix=""): - repr_val = repr(node.value) - if repr_val[0] in 'ub': - repr_val = repr_val[1:] - self.put("%s%s" % (prefix, repr_val)) + self.put(f"{node.value!r}j") def visit_BytesNode(self, node): - self.emit_string(node, "b") - - def visit_StringNode(self, node): - self.emit_string(node) + self.put(repr(node.value)) def visit_UnicodeNode(self, node): - self.emit_string(node, "u") + self.put(repr(node.value)) def emit_sequence(self, node, parens=("", "")): open_paren, close_paren = parens diff --git a/Cython/Compiler/AnalysedTreeTransforms.py b/Cython/Compiler/AnalysedTreeTransforms.py index 0dbaed857cc..7ab54d7ff63 100644 --- a/Cython/Compiler/AnalysedTreeTransforms.py +++ b/Cython/Compiler/AnalysedTreeTransforms.py @@ -49,8 +49,8 @@ def visit_ModuleNode(self, node): def add_test(self, testpos, path, doctest): pos = self.testspos - keystr = '%s (line %d)' % (path, testpos[1]) - key = UnicodeNode(pos, value=EncodedString(keystr)) + keystr = EncodedString(f'{path} (line {testpos[1]:d})') + key = UnicodeNode(pos, value=keystr) value = UnicodeNode(pos, value=doctest) self.tests.append(DictItemNode(pos, key=key, value=value)) diff --git a/Cython/Compiler/AutoDocTransforms.py b/Cython/Compiler/AutoDocTransforms.py index ebd67758f0a..a264f8e21ad 100644 --- a/Cython/Compiler/AutoDocTransforms.py +++ b/Cython/Compiler/AutoDocTransforms.py @@ -38,13 +38,8 @@ def visit_LambdaNode(self, node): "Failed to convert lambda to string representation in {}".format( self.description), level=1) - def visit_UnicodeNode(self, node): - # Discard Unicode prefix in annotations. Any tool looking at them - # would probably expect Py3 string semantics. 
- self.emit_string(node, "") - def visit_AnnotationNode(self, node): - self.put(node.string.unicode_value) + self.put(node.string.value) class EmbedSignature(CythonTransform): diff --git a/Cython/Compiler/Builtin.py b/Cython/Compiler/Builtin.py index 4bc0656ac6e..d3ed6ddb064 100644 --- a/Cython/Compiler/Builtin.py +++ b/Cython/Compiler/Builtin.py @@ -151,7 +151,7 @@ def declare_in_type(self, self_type): #('bin', "", "", ""), BuiltinFunction('callable', "O", "b", "__Pyx_PyCallable_Check", utility_code = UtilityCode.load("CallableCheck", "ObjectHandling.c")), - BuiltinFunction('chr', "i", "O", "PyUnicode_FromOrdinal", builtin_return_type='unicode'), + BuiltinFunction('chr', "i", "O", "PyUnicode_FromOrdinal", builtin_return_type='str'), #('cmp', "", "", "", ""), # int PyObject_Cmp(PyObject *o1, PyObject *o2, int *result) #('compile', "", "", ""), # PyObject* Py_CompileString( char *str, char *filename, int start) BuiltinFunction('delattr', "OO", "r", "PyObject_DelAttr"), @@ -225,14 +225,13 @@ def declare_in_type(self, self_type): #('raw_input', "", "", ""), #('reduce', "", "", ""), BuiltinFunction('reload', "O", "O", "PyImport_ReloadModule"), - BuiltinFunction('repr', "O", "O", "PyObject_Repr", builtin_return_type='unicode'), + BuiltinFunction('repr', "O", "O", "PyObject_Repr", builtin_return_type='str'), #('round', "", "", ""), BuiltinFunction('setattr', "OOO", "r", "PyObject_SetAttr"), #('sum', "", "", ""), #('sorted', "", "", ""), #('type', "O", "O", "PyObject_Type"), - BuiltinFunction('unichr', "i", "O", "PyUnicode_FromOrdinal", builtin_return_type='unicode'), - #('unicode', "", "", ""), + BuiltinFunction('unichr', "i", "O", "PyUnicode_FromOrdinal", builtin_return_type='str'), #('vars', "", "", ""), #('zip', "", "", ""), # Can't do these easily until we have builtin type entries. 
@@ -286,12 +285,6 @@ def declare_in_type(self, self_type): BuiltinAttribute('imag', 'cval.imag', field_type = PyrexTypes.c_double_type), ]), - ("basestring", "&PyBaseString_Type", [ - BuiltinMethod("join", "TO", "T", "__Pyx_PyBaseString_Join", - utility_code=UtilityCode.load("StringJoin", "StringTools.c")), - BuiltinMethod("__mul__", "Tz", "T", "__Pyx_PySequence_Multiply", - utility_code=UtilityCode.load("PySequenceMultiply", "ObjectHandling.c")), - ]), ("bytearray", "&PyByteArray_Type", [ BuiltinMethod("__mul__", "Tz", "T", "__Pyx_PySequence_Multiply", utility_code=UtilityCode.load("PySequenceMultiply", "ObjectHandling.c")), @@ -301,12 +294,7 @@ def declare_in_type(self, self_type): BuiltinMethod("__mul__", "Tz", "T", "__Pyx_PySequence_Multiply", utility_code=UtilityCode.load("PySequenceMultiply", "ObjectHandling.c")), ]), - ("str", "&PyString_Type", [BuiltinMethod("join", "TO", "T", "__Pyx_PyString_Join", - utility_code=UtilityCode.load("StringJoin", "StringTools.c")), - BuiltinMethod("__mul__", "Tz", "T", "__Pyx_PySequence_Multiply", - utility_code=UtilityCode.load("PySequenceMultiply", "ObjectHandling.c")), - ]), - ("unicode", "&PyUnicode_Type", [BuiltinMethod("__contains__", "TO", "b", "PyUnicode_Contains"), + ("str", "&PyUnicode_Type", [BuiltinMethod("__contains__", "TO", "b", "PyUnicode_Contains"), BuiltinMethod("join", "TO", "T", "PyUnicode_Join"), BuiltinMethod("__mul__", "Tz", "T", "__Pyx_PySequence_Multiply", utility_code=UtilityCode.load("PySequenceMultiply", "ObjectHandling.c")), @@ -425,14 +413,14 @@ def declare_in_type(self, self_type): 'float': dict( as_integer_ratio='tuple[int,int]', is_integer='bint', - hex='unicode', + hex='str', fromhex='T', # classmethod ), 'list': dict( index='Py_ssize_t', count='Py_ssize_t', ), - 'unicode': dict( + 'str': dict( capitalize='T', casefold='T', center='T', @@ -482,12 +470,12 @@ def declare_in_type(self, self_type): zfill='T', ), 'bytes': dict( - hex='unicode', + hex='str', fromhex='T', # classmethod count='Py_ssize_t', removeprefix='T', removesuffix='T', - decode='unicode', + decode='str', endswith='bint', find='Py_ssize_t', index='Py_ssize_t', @@ -530,7 +518,7 @@ def declare_in_type(self, self_type): ), 'memoryview': dict( tobytes='bytes', - hex='unicode', + hex='str', tolist='list', toreadonly='T', cast='T', @@ -555,7 +543,6 @@ def declare_in_type(self, self_type): inferred_method_return_types['bytearray'].update(inferred_method_return_types['bytes']) inferred_method_return_types['frozenset'].update(inferred_method_return_types['set']) -inferred_method_return_types['str'] = inferred_method_return_types['unicode'] def find_return_type_of_builtin_method(builtin_type, method_name): @@ -619,6 +606,8 @@ def init_builtin_types(): objstruct_cname = 'PyByteArrayObject' elif name == 'int': objstruct_cname = 'PyLongObject' + elif name == 'str': + objstruct_cname = 'PyUnicodeObject' elif name == 'bool': objstruct_cname = None elif name == 'BaseException': @@ -640,6 +629,7 @@ def init_builtin_types(): for method in methods: method.declare_in_type(the_type) + def init_builtin_structs(): for name, cname, attribute_types in builtin_structs_table: scope = StructOrUnionScope(name) @@ -662,7 +652,7 @@ def init_builtins(): entry.utility_code = UtilityCode.load_cached("AssertionsEnabled", "Exceptions.c") global type_type, list_type, tuple_type, dict_type, set_type, frozenset_type, slice_type - global bytes_type, str_type, unicode_type, basestring_type, bytearray_type + global bytes_type, unicode_type, bytearray_type global float_type, int_type, bool_type, 
complex_type global memoryview_type, py_buffer_type global sequence_types @@ -675,9 +665,7 @@ def init_builtins(): slice_type = builtin_scope.lookup('slice').type bytes_type = builtin_scope.lookup('bytes').type - str_type = builtin_scope.lookup('str').type - unicode_type = builtin_scope.lookup('unicode').type - basestring_type = builtin_scope.lookup('basestring').type + unicode_type = builtin_scope.lookup('str').type bytearray_type = builtin_scope.lookup('bytearray').type memoryview_type = builtin_scope.lookup('memoryview').type @@ -690,9 +678,7 @@ def init_builtins(): list_type, tuple_type, bytes_type, - str_type, unicode_type, - basestring_type, bytearray_type, memoryview_type, ) diff --git a/Cython/Compiler/Code.pxd b/Cython/Compiler/Code.pxd index 14400e86d0a..c6431933854 100644 --- a/Cython/Compiler/Code.pxd +++ b/Cython/Compiler/Code.pxd @@ -82,7 +82,7 @@ cdef class StringConst: cdef public dict py_strings cdef public list py_versions - cpdef get_py_string_const(self, encoding, identifier=*, bint is_str=*, py3str_cstring=*) + cpdef get_py_string_const(self, encoding, identifier=*) ## cdef class PyStringConst: ## cdef public object cname diff --git a/Cython/Compiler/Code.py b/Cython/Compiler/Code.py index adb713b22eb..eaa67dd6ed8 100644 --- a/Cython/Compiler/Code.py +++ b/Cython/Compiler/Code.py @@ -7,7 +7,7 @@ cython.declare(os=object, re=object, operator=object, textwrap=object, Template=object, Naming=object, Options=object, StringEncoding=object, Utils=object, SourceDescriptor=object, StringIOTree=object, - DebugFlags=object, basestring=object, defaultdict=object, + DebugFlags=object, defaultdict=object, closing=object, partial=object, wraps=object) import hashlib @@ -1228,27 +1228,19 @@ def __init__(self, cname, text, byte_string): self.text = text self.escaped_value = StringEncoding.escape_byte_string(byte_string) self.py_strings = None - self.py_versions = [] - def add_py_version(self, version): - if not version: - self.py_versions = [2, 3] - elif version not in self.py_versions: - self.py_versions.append(version) - - def get_py_string_const(self, encoding, identifier=None, - is_str: cython.bint = False, py3str_cstring=None): + def get_py_string_const(self, encoding, identifier=None): text = self.text intern: cython.bint + is_unicode: cython.bint - is_str = bool(identifier or is_str) - is_unicode: cython.bint = encoding is None and not is_str - - if encoding is None: + if identifier or encoding is None: # unicode string - encoding_key = None + encoding = encoding_key = None + is_unicode = True else: - # bytes or str + # bytes + is_unicode = False encoding = encoding.lower() if encoding in ('utf8', 'utf-8', 'ascii', 'usascii', 'us-ascii'): encoding = None @@ -1256,15 +1248,6 @@ def get_py_string_const(self, encoding, identifier=None, else: encoding_key = ''.join(find_alphanums(encoding)) - key = (is_str, is_unicode, encoding_key, py3str_cstring) - if self.py_strings is None: - self.py_strings = {} - else: - try: - return self.py_strings[key] - except KeyError: - pass - if identifier: intern = True elif identifier is None: @@ -1275,15 +1258,23 @@ def get_py_string_const(self, encoding, identifier=None, else: intern = False + key = (intern, is_unicode, encoding_key) + if self.py_strings is None: + self.py_strings = {} + else: + try: + return self.py_strings[key] + except KeyError: + pass + pystring_cname = ( f"{Naming.interned_prefixes['str'] if intern else Naming.py_const_prefix}" - f"{'s' if is_str else 'u' if is_unicode else 'b'}" + f"{'u' if is_unicode else 'b'}" f"{'_' + 
encoding_key if encoding_key else ''}" f"_{self.cname[len(Naming.const_prefix):]}" ) - py_string = PyStringConst( - pystring_cname, encoding, is_unicode, is_str, py3str_cstring, intern) + py_string = PyStringConst(pystring_cname, encoding, intern, is_unicode) self.py_strings[key] = py_string return py_string @@ -1292,18 +1283,13 @@ class PyStringConst: """Global info about a Python string constant held by GlobalState. """ # cname string - # py3str_cstring string # encoding string # intern boolean # is_unicode boolean - # is_str boolean - def __init__(self, cname, encoding, is_unicode, is_str=False, - py3str_cstring=None, intern=False): + def __init__(self, cname, encoding, intern=False, is_unicode=False): self.cname = cname - self.py3str_cstring = py3str_cstring self.encoding = encoding - self.is_str = is_str self.is_unicode = is_unicode self.intern = intern @@ -1578,7 +1564,7 @@ def get_argument_default_const(self, type): # aren't just Python objects return c - def get_string_const(self, text, py_version=None): + def get_string_const(self, text): # return a C string constant, creating a new one if necessary if text.is_unicode: byte_string = text.utf8encode() @@ -1588,7 +1574,6 @@ def get_string_const(self, text, py_version=None): c = self.string_const_index[byte_string] except KeyError: c = self.new_string_const(text, byte_string) - c.add_py_version(py_version) return c def get_pyunicode_ptr_const(self, text): @@ -1600,18 +1585,10 @@ def get_pyunicode_ptr_const(self, text): c = self.pyunicode_ptr_const_index[text] = self.new_const_cname() return c - def get_py_string_const(self, text, identifier=None, - is_str=False, unicode_value=None): + def get_py_string_const(self, text, identifier=None): # return a Python string constant, creating a new one if necessary - py3str_cstring = None - if is_str and unicode_value is not None \ - and unicode_value.utf8encode() != text.byteencode(): - py3str_cstring = self.get_string_const(unicode_value, py_version=3) - c_string = self.get_string_const(text, py_version=2) - else: - c_string = self.get_string_const(text) - py_string = c_string.get_py_string_const( - text.encoding, identifier, is_str, py3str_cstring) + c_string = self.get_string_const(text) + py_string = c_string.get_py_string_const(text.encoding, identifier) return py_string def get_py_codeobj_const(self, node): @@ -1818,24 +1795,18 @@ def generate_string_constants(self): encodings = set() def normalise_encoding_name(py_string): - if py_string.is_str and py_string.encoding and py_string.encoding not in ( + if py_string.encoding and py_string.encoding not in ( 'ASCII', 'USASCII', 'US-ASCII', 'UTF8', 'UTF-8'): - return '"%s"' % py_string.encoding.lower() + return f'"{py_string.encoding.lower()}"' else: return '0' decls_writer = self.parts['string_decls'] for _, cname, c in c_consts: - conditional = False - if c.py_versions and (2 not in c.py_versions or 3 not in c.py_versions): - conditional = True - decls_writer.putln("#if PY_MAJOR_VERSION %s 3" % ( - (2 in c.py_versions) and '<' or '>=')) - decls_writer.putln('static const char %s[] = "%s";' % ( - cname, StringEncoding.split_string_literal(c.escaped_value)), + cliteral = StringEncoding.split_string_literal(c.escaped_value) + decls_writer.putln( + f'static const char {cname}[] = "{cliteral}";', safe=True) # Braces in user strings are not for indentation. 
- if conditional: - decls_writer.putln("#endif") if c.py_strings is not None: if len(c.escaped_value) > longest_pystring: # This is not an accurate count since it adds up C escape characters, @@ -1887,7 +1858,6 @@ def normalise_encoding_name(py_string): const Py_ssize_t encoding; #endif const unsigned int is_unicode : 1; - const unsigned int is_str : 1; const unsigned int intern : 1; } __Pyx_StringTabEntry; """ % dict( @@ -1910,16 +1880,8 @@ def normalise_encoding_name(py_string): w.putln("static const __Pyx_StringTabEntry %s[] = {" % Naming.stringtab_cname) for n, (c_cname, _, py_string) in enumerate(py_strings): - # TODO: 'py_string.py3str_cstring' can probably be removed - if py_string.py3str_cstring: - c_cname = py_string.py3str_cstring.cname - encodings_index = 0 - is_unicode = 1 - is_str = 0 - else: - encodings_index = encodings_map[normalise_encoding_name(py_string)] - is_unicode = py_string.is_unicode - is_str = py_string.is_str + encodings_index = encodings_map[normalise_encoding_name(py_string)] + is_unicode = py_string.is_unicode self.parts['module_state_defines'].putln("#define %s %s->%s[%s]" % ( py_string.cname, @@ -1927,16 +1889,15 @@ def normalise_encoding_name(py_string): Naming.stringtab_cname, n)) - w.putln("{%s, sizeof(%s), %d, %d, %d, %d}, /* PyObject cname: %s */" % ( + w.putln("{%s, sizeof(%s), %d, %d, %d}, /* PyObject cname: %s */" % ( c_cname, c_cname, encodings_index, is_unicode, - is_str, py_string.intern, py_string.cname )) - w.putln("{0, 0, 0, 0, 0, 0}") + w.putln("{0, 0, 0, 0, 0}") w.putln("};") self.use_utility_code(UtilityCode.load_cached("InitStrings", "StringTools.c")) @@ -2354,10 +2315,8 @@ def get_string_const(self, text): def get_pyunicode_ptr_const(self, text): return self.globalstate.get_pyunicode_ptr_const(text) - def get_py_string_const(self, text, identifier=None, - is_str=False, unicode_value=None): - return self.globalstate.get_py_string_const( - text, identifier, is_str, unicode_value).cname + def get_py_string_const(self, text, identifier=None): + return self.globalstate.get_py_string_const(text, identifier).cname def get_py_codeobj_const(self, node): return self.globalstate.get_py_codeobj_const(node) diff --git a/Cython/Compiler/Dataclass.py b/Cython/Compiler/Dataclass.py index aa262bd1904..50d08d96082 100644 --- a/Cython/Compiler/Dataclass.py +++ b/Cython/Compiler/Dataclass.py @@ -709,7 +709,7 @@ def get_field_type(pos, entry): #) #return ExprNodes.IndexNode( # pos, base=annotations, - # index=ExprNodes.StringNode(pos, value=entry.name) + # index=ExprNodes.UnicodeNode(pos, value=entry.name) #) else: # it's slightly unclear what the best option is here - we could @@ -717,7 +717,7 @@ def get_field_type(pos, entry): # attributes defined with cdef so Cython is free to make it's own # decision s = EncodedString(entry.type.declaration_code("", for_display=1)) - return ExprNodes.StringNode(pos, value=s) + return ExprNodes.UnicodeNode(pos, value=s) class FieldRecordNode(ExprNodes.ExprNode): @@ -752,7 +752,7 @@ def _make_string(self): from .AutoDocTransforms import AnnotationWriter writer = AnnotationWriter(description="Dataclass field") string = writer.write(self.arg) - return ExprNodes.StringNode(self.pos, value=EncodedString(string)) + return ExprNodes.UnicodeNode(self.pos, value=EncodedString(string)) def generate_evaluation_code(self, code): return self.arg.generate_evaluation_code(code) diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index 44afa4e4d6a..81e85a09e32 100644 --- a/Cython/Compiler/ExprNodes.py +++ 
b/Cython/Compiler/ExprNodes.py @@ -9,7 +9,7 @@ StringEncoding=object, operator=object, local_errors=object, report_error=object, Naming=object, Nodes=object, PyrexTypes=object, py_object_type=object, list_type=object, tuple_type=object, set_type=object, dict_type=object, - unicode_type=object, str_type=object, bytes_type=object, type_type=object, + unicode_type=object, bytes_type=object, type_type=object, Builtin=object, Symtab=object, Utils=object, find_coercion_error=object, debug_disposal_code=object, debug_temp_alloc=object, debug_coercion=object, bytearray_type=object, slice_type=object, memoryview_type=object, @@ -37,7 +37,7 @@ from . import TypeSlots from .Builtin import ( list_type, tuple_type, set_type, dict_type, type_type, - unicode_type, str_type, bytes_type, bytearray_type, basestring_type, + unicode_type, bytes_type, bytearray_type, slice_type, sequence_types as builtin_sequence_types, memoryview_type, ) from . import Builtin @@ -75,31 +75,15 @@ def __repr__(self): # error messages when coercing from key[0] to key[1] coercion_error_dict = { # string related errors - (unicode_type, str_type): ("Cannot convert Unicode string to 'str' implicitly." - " This is not portable and requires explicit encoding."), (unicode_type, bytes_type): "Cannot convert Unicode string to 'bytes' implicitly, encoding required.", (unicode_type, PyrexTypes.c_char_ptr_type): "Unicode objects only support coercion to Py_UNICODE*.", (unicode_type, PyrexTypes.c_const_char_ptr_type): "Unicode objects only support coercion to Py_UNICODE*.", (unicode_type, PyrexTypes.c_uchar_ptr_type): "Unicode objects only support coercion to Py_UNICODE*.", (unicode_type, PyrexTypes.c_const_uchar_ptr_type): "Unicode objects only support coercion to Py_UNICODE*.", - (bytes_type, unicode_type): "Cannot convert 'bytes' object to unicode implicitly, decoding required", - (bytes_type, str_type): "Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.", - (bytes_type, basestring_type): ("Cannot convert 'bytes' object to basestring implicitly." - " This is not portable to Py3."), - (bytes_type, PyrexTypes.c_py_unicode_ptr_type): "Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.", + (bytes_type, unicode_type): "Cannot convert 'bytes' object to str implicitly, decoding required", + (bytes_type, PyrexTypes.c_py_unicode_ptr_type): "Cannot convert 'bytes' object to Py_UNICODE*, use 'str'.", (bytes_type, PyrexTypes.c_const_py_unicode_ptr_type): ( - "Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'."), - (basestring_type, bytes_type): "Cannot convert 'basestring' object to bytes implicitly. This is not portable.", - (str_type, unicode_type): ("str objects do not support coercion to unicode," - " use a unicode string literal instead (u'')"), - (str_type, bytes_type): "Cannot convert 'str' to 'bytes' implicitly. 
This is not portable.", - (str_type, PyrexTypes.c_char_ptr_type): "'str' objects do not support coercion to C types (use 'bytes'?).", - (str_type, PyrexTypes.c_const_char_ptr_type): "'str' objects do not support coercion to C types (use 'bytes'?).", - (str_type, PyrexTypes.c_uchar_ptr_type): "'str' objects do not support coercion to C types (use 'bytes'?).", - (str_type, PyrexTypes.c_const_uchar_ptr_type): "'str' objects do not support coercion to C types (use 'bytes'?).", - (str_type, PyrexTypes.c_py_unicode_ptr_type): "'str' objects do not support coercion to C types (use 'unicode'?).", - (str_type, PyrexTypes.c_const_py_unicode_ptr_type): ( - "'str' objects do not support coercion to C types (use 'unicode'?)."), + "Cannot convert 'bytes' object to Py_UNICODE*, use 'str'."), (PyrexTypes.c_char_ptr_type, unicode_type): "Cannot convert 'char*' to unicode implicitly, decoding required", (PyrexTypes.c_const_char_ptr_type, unicode_type): ( "Cannot convert 'char*' to unicode implicitly, decoding required"), @@ -194,11 +178,6 @@ def make_dedup_key(outer_type, item_nodes): # For constants, look at the Python value type if we don't know the concrete Cython type. else (node.type, node.constant_result, type(node.constant_result) if node.type is py_object_type else None) if node.has_constant_result() - # IdentifierStringNode doesn't usually have a "constant_result" set because: - # 1. it doesn't usually have unicode_value - # 2. it's often created later in the compilation process after ConstantFolding - # but should be cacheable - else (node.type, node.value, node.unicode_value, "IdentifierStringNode") if isinstance(node, IdentifierStringNode) else None # something we cannot handle => short-circuit below for node in item_nodes ] @@ -1729,16 +1708,25 @@ def calculate_result_code(self): class UnicodeNode(ConstNode): - # A Py_UNICODE* or unicode literal + # A unicode literal # # value EncodedString # bytes_value BytesLiteral the literal parsed as bytes string # ('-3' unicode literals only) + # is_identifier boolean is_string_literal = True + is_identifier = None bytes_value = None type = unicode_type + def __init__(self, pos, value, bytes_value=None, type=None): + super().__init__(pos, value=value, constant_result=value) + if bytes_value is not None: + self.bytes_value = bytes_value + if type is not None and type is not unicode_type: + self.type = type + def calculate_constant_result(self): self.constant_result = self.value @@ -1746,20 +1734,14 @@ def analyse_as_type(self, env): return _analyse_name_as_type(self.value, self.pos, env) def as_sliced_node(self, start, stop, step=None): - if StringEncoding.string_contains_surrogates(self.value[:stop]): - # this is unsafe as it may give different results - # in different runtimes - return None - value = StringEncoding.EncodedString(self.value[start:stop:step]) - value.encoding = self.value.encoding + value = StringEncoding.encoded_string( + self.value[start:stop:step], self.value.encoding) if self.bytes_value is not None: bytes_value = StringEncoding.bytes_literal( self.bytes_value[start:stop:step], self.bytes_value.encoding) else: bytes_value = None - return UnicodeNode( - self.pos, value=value, bytes_value=bytes_value, - constant_result=value) + return UnicodeNode(self.pos, value=value, bytes_value=bytes_value) def coerce_to(self, dst_type, env): if dst_type is self.type: @@ -1794,7 +1776,7 @@ def coerce_to(self, dst_type, env): "Unicode literals do not support coercion to C types other " "than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* " "(for 
strings).") - elif dst_type not in (py_object_type, Builtin.basestring_type): + elif dst_type is not py_object_type: self.check_for_coercion_error(dst_type, env, fail=True) return self @@ -1840,6 +1822,8 @@ def generate_evaluation_code(self, code): const_code.error_goto_if_null(self.result_code, self.pos))) const_code.put_error_if_neg( self.pos, "__Pyx_PyUnicode_READY(%s)" % self.result_code) + elif self.is_identifier: + self.result_code = code.intern_identifier(self.value) else: self.result_code = code.get_py_string_const(self.value) else: @@ -1848,82 +1832,12 @@ def generate_evaluation_code(self, code): def calculate_result_code(self): return self.result_code - def compile_time_value(self, env): + def compile_time_value(self, denv): return self.value -class StringNode(PyConstNode): - # A Python str object, i.e. a byte string in Python 2.x and a - # unicode string in Python 3.x - # - # value BytesLiteral (or EncodedString with ASCII content) - # unicode_value EncodedString or None - # is_identifier boolean - - type = str_type - is_string_literal = True - is_identifier = None - unicode_value = None - - def calculate_constant_result(self): - if self.unicode_value is not None: - # only the Unicode value is portable across Py2/3 - self.constant_result = self.unicode_value - - def analyse_as_type(self, env): - return _analyse_name_as_type(self.unicode_value or self.value.decode('ISO8859-1'), self.pos, env) - - def as_sliced_node(self, start, stop, step=None): - value = type(self.value)(self.value[start:stop:step]) - value.encoding = self.value.encoding - if self.unicode_value is not None: - if StringEncoding.string_contains_surrogates(self.unicode_value[:stop]): - # this is unsafe as it may give different results in different runtimes - return None - unicode_value = StringEncoding.EncodedString( - self.unicode_value[start:stop:step]) - else: - unicode_value = None - return StringNode( - self.pos, value=value, unicode_value=unicode_value, - constant_result=value, is_identifier=self.is_identifier) - - def coerce_to(self, dst_type, env): - if dst_type is not py_object_type and not str_type.subtype_of(dst_type): -# if dst_type is Builtin.bytes_type: -# # special case: bytes = 'str literal' -# return BytesNode(self.pos, value=self.value) - if not dst_type.is_pyobject: - return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env) - if dst_type is not Builtin.basestring_type: - self.check_for_coercion_error(dst_type, env, fail=True) - return self - - def can_coerce_to_char_literal(self): - return not self.is_identifier and len(self.value) == 1 - - def generate_evaluation_code(self, code): - self.result_code = code.get_py_string_const( - self.value, identifier=self.is_identifier, is_str=True, - unicode_value=self.unicode_value) - - def get_constant_c_result_code(self): - return None - - def calculate_result_code(self): - return self.result_code - - def compile_time_value(self, env): - if self.value.is_unicode: - return self.value - if self.unicode_value is not None: - return self.unicode_value - return self.value.decode('iso8859-1') - - -class IdentifierStringNode(StringNode): - # A special str value that represents an identifier (bytes in Py2, - # unicode in Py3). +class IdentifierStringNode(UnicodeNode): + # A special str value that represents an identifier (a Unicode name). 
is_identifier = True @@ -2051,6 +1965,7 @@ def type_dependencies(self, env): def infer_type(self, env): if self.entry is None: self.entry = env.lookup(self.name) + if self.entry is None or self.entry.type is unspecified_type: if self.inferred_type is not None: return self.inferred_type @@ -2060,6 +1975,9 @@ def infer_type(self, env): # Unfortunately the type attribute of type objects # is used for the pointer to the type they represent. return type_type + elif self.entry.type is unicode_type and self.name in ('unicode', 'basestring'): + # Keep recognising the old Py2 names for 'str' as type. + return type_type elif self.entry.type.is_cfunction: if self.entry.scope.is_builtin_scope: # special case: optimised builtin functions must be treated as Python objects @@ -2793,7 +2711,7 @@ class ImportNode(ExprNode): # Implements result = # __import__(module_name, globals(), None, name_list, level) # - # module_name StringNode dotted name of module. Empty module + # module_name UnicodeNode dotted name of module. Empty module # name means importing the parent package according # to level # name_list ListNode or None list of names to be imported @@ -2831,7 +2749,7 @@ def analyse_types(self, env): self.name_list = name_list.coerce_to_pyobject(env) elif '.' in self.module_name.value: self.module_names = TupleNode(self.module_name.pos, args=[ - IdentifierStringNode(self.module_name.pos, value=part, constant_result=part) + IdentifierStringNode(self.module_name.pos, value=part) for part in map(StringEncoding.EncodedString, self.module_name.value.split('.')) ]).analyse_types(env) return self @@ -3725,7 +3643,7 @@ class FormattedValueNode(ExprNode): gil_message = "String formatting" find_conversion_func = { - 's': 'PyObject_Unicode', + 's': 'PyObject_Str', 'r': 'PyObject_Repr', 'a': 'PyObject_ASCII', 'd': '__Pyx_PyNumber_Long', # NOTE: internal mapping for '%d' formatting @@ -3876,7 +3794,7 @@ def is_ephemeral(self): # in most cases, indexing will return a safe reference to an object in a container, # so we consider the result safe if the base object is return self.base.is_ephemeral() or self.base.type in ( - basestring_type, str_type, bytes_type, bytearray_type, unicode_type) + unicode_type, bytes_type, bytearray_type) def check_const_addr(self): return self.base.check_const_addr() and self.index.check_const() @@ -3934,7 +3852,7 @@ def may_be_none(self): if base_type: if base_type.is_string: return False - if base_type in (unicode_type, bytes_type, str_type, bytearray_type, basestring_type): + if base_type in (unicode_type, bytes_type, bytearray_type): return False if isinstance(self.index, SliceNode): # slicing! @@ -4005,8 +3923,7 @@ def infer_type(self, env): elif base_type.is_pyunicode_ptr: # sliced Py_UNICODE* strings must coerce to Python return unicode_type - elif base_type in (unicode_type, bytes_type, str_type, - bytearray_type, list_type, tuple_type): + elif base_type in (unicode_type, bytes_type, bytearray_type, list_type, tuple_type): # slicing these returns the same type return base_type elif base_type.is_memoryviewslice: @@ -4027,17 +3944,8 @@ def infer_type(self, env): # to receive it, throw it away, and potentially rebuild it # on a subsequent PyObject coercion. 
return PyrexTypes.c_py_ucs4_type - elif base_type is str_type: - # always returns str - Py2: bytes, Py3: unicode - return base_type - elif base_type is bytearray_type: + elif base_type is bytearray_type or self.base is bytes_type: return PyrexTypes.c_uchar_type - elif isinstance(self.base, BytesNode): - #if env.global_scope().context.language_level >= 3: - # # inferring 'char' can be made to work in Python 3 mode - # return PyrexTypes.c_char_type - # Py2/3 return different types on indexing bytes objects - return py_object_type elif base_type in (tuple_type, list_type): # if base is a literal, take a look at its values item_type = infer_sequence_item_type( @@ -4088,12 +3996,12 @@ def __init__(self, **kwds): return PythranExpr(pythran_indexing_type(base_type, [index_with_type])) # may be slicing or indexing, we don't know - if base_type in (unicode_type, str_type): - # these types always returns their own type on Python indexing/slicing + if base_type is unicode_type: + # always returns its own type on Python indexing/slicing return base_type - else: - # TODO: Handle buffers (hopefully without too much redundancy). - return py_object_type + + # TODO: Handle buffers (hopefully without too much redundancy). + return py_object_type def analyse_types(self, env): return self.analyse_base_and_index_types(env, getting=True) @@ -4215,7 +4123,7 @@ def analyse_as_pyobject(self, env, is_slice, getting, setting): else: # not using 'uchar' to enable fast and safe error reporting as '-1' self.type = PyrexTypes.c_int_type - elif is_slice and base_type in (bytes_type, bytearray_type, str_type, unicode_type, list_type, tuple_type): + elif is_slice and base_type in (bytes_type, bytearray_type, unicode_type, list_type, tuple_type): self.type = base_type else: item_type = None @@ -4543,7 +4451,7 @@ def generate_result_code(self, code): if base_type is dict_type: function = "__Pyx_PyDict_GetItem" utility_code = UtilityCode.load_cached("DictGetItem", "ObjectHandling.c") - elif base_type is py_object_type and self.index.type in (str_type, unicode_type): + elif base_type is py_object_type and self.index.type is unicode_type: # obj[str] is probably doing a dict lookup function = "__Pyx_PyObject_Dict_GetItem" utility_code = UtilityCode.load_cached("DictGetItem", "ObjectHandling.c") @@ -5302,8 +5210,8 @@ def infer_type(self, env): return bytes_type elif base_type.is_pyunicode_ptr: return unicode_type - elif base_type in (bytes_type, bytearray_type, str_type, unicode_type, - basestring_type, list_type, tuple_type): + elif base_type in (bytes_type, bytearray_type, unicode_type, + list_type, tuple_type): return base_type elif base_type.is_ptr or base_type.is_array: return PyrexTypes.c_array_type(base_type.base_type, None) @@ -5323,8 +5231,8 @@ def may_be_none(self): if base_type: if base_type.is_string: return False - if base_type in (bytes_type, str_type, unicode_type, - basestring_type, list_type, tuple_type): + if base_type in (bytes_type, bytearray_type, unicode_type, + list_type, tuple_type): return False return ExprNode.may_be_none(self) @@ -5492,9 +5400,8 @@ def nogil_check(self, env): def coerce_to(self, dst_type, env): if ((self.base.type.is_string or self.base.type.is_cpp_string) - and dst_type in (bytes_type, bytearray_type, str_type, unicode_type)): - if (dst_type not in (bytes_type, bytearray_type) - and not env.directives['c_string_encoding']): + and dst_type in (bytes_type, bytearray_type, unicode_type)): + if (dst_type is unicode_type and not env.directives['c_string_encoding']): error(self.pos, "default 
encoding required for conversion from '%s' to '%s'" % (self.base.type, dst_type)) @@ -5521,7 +5428,7 @@ def generate_result_code(self, code): base_result = self.base.result_as(PyrexTypes.c_const_char_ptr_type) if self.type is bytearray_type: type_name = 'ByteArray' - elif self.type is str_type: + elif self.type is unicode_type: type_name = 'Unicode' else: type_name = self.type.name.title() @@ -5912,9 +5819,12 @@ def infer_type(self, env): if result_type.is_extension_type: return result_type elif result_type.is_builtin_type: - if function.entry.name == 'float': + func_name = function.entry.name + if func_name == 'float': return PyrexTypes.c_double_type - elif function.entry.name in Builtin.types_that_construct_their_instance: + elif func_name == 'bool': + return PyrexTypes.c_bint_type + elif func_name in Builtin.types_that_construct_their_instance: return result_type func_type = self.function.analyse_as_type(env) if func_type and (func_type.is_struct_or_union or func_type.is_cpp_class): @@ -5988,7 +5898,7 @@ def analyse_as_type_constructor(self, env): args, kwds = self.explicit_args_kwds() items = [] for arg, member in zip(args, type.scope.var_entries): - items.append(DictItemNode(pos=arg.pos, key=StringNode(pos=arg.pos, value=member.name), value=arg)) + items.append(DictItemNode(arg.pos, key=UnicodeNode(arg.pos, value=member.name), value=arg)) if kwds: items += kwds.key_value_pairs self.key_value_pairs = items @@ -6273,7 +6183,8 @@ def analyse_c_function_call(self, env): for i in range(max_nargs, actual_nargs): arg = args[i] if arg.type.is_pyobject: - if arg.type is str_type: + if arg.type is unicode_type: + # TODO: require "arg.type.bytes_value"? arg_ctype = PyrexTypes.c_char_ptr_type else: arg_ctype = arg.type.default_coerced_ctype() @@ -6673,10 +6584,7 @@ def attribute_is_likely_method(attr): )) if kwargs_key_value_pairs: for n, keyvalue in enumerate(kwargs_key_value_pairs): - key_is_str = ( - (keyvalue.key.type is Builtin.str_type or keyvalue.key.type is Builtin.unicode_type) - and not keyvalue.key.may_be_none() - ) + key_is_str = keyvalue.key.type is Builtin.unicode_type and not keyvalue.key.may_be_none() code.put_error_if_neg( self.pos, "__Pyx_VectorcallBuilder_AddArg%s(%s, %s, %s, __pyx_callargs+%d, %d)" % ( @@ -9405,7 +9313,7 @@ def coerce_to(self, dst_type, env): item.key = item.key.arg if not item.key.is_string_literal: error(item.key.pos, "Invalid struct field identifier") - item.key = StringNode(item.key.pos, value="") + item.key = UnicodeNode(item.key.pos, value=StringEncoding.EncodedString("")) else: key = str(item.key.value) # converts string literals to unicode in Py3 member = dst_type.scope.lookup_here(key) @@ -10207,8 +10115,8 @@ def node_positions_to_offset(self): class CodeObjectNode(ExprNode): # Create a PyCodeObject for a CyFunction instance. # - # def_node DefNode the Python function node - # varnames [StringNode] a list of all local variable names + # def_node DefNode the Python function node + # varnames [IdentifierStringNode] a list of all local variable names subexprs = ['varnames'] is_temp = False @@ -10244,16 +10152,14 @@ def generate_codeobj(self, code, error_label): func = self.def_node first_lineno = self.pos[1] - func_name_result = code.get_py_string_const( - func.name, identifier=True, is_str=False, unicode_value=func.name) + func_name_result = code.get_py_string_const(func.name, identifier=True) # FIXME: better way to get the module file path at module init time? Encoding to use? 
file_path = func.pos[0].get_filenametable_entry() if os.path.isabs(file_path): file_path = func.pos[0].get_description() # Always use / as separator - file_path = pathlib.Path(file_path).as_posix() - file_path = StringEncoding.bytes_literal(file_path.encode('utf-8'), 'utf8') - file_path_result = code.get_py_string_const(file_path, identifier=False, is_str=True) + file_path = StringEncoding.EncodedString(pathlib.Path(file_path).as_posix()) + file_path_result = code.get_py_string_const(file_path) if func.node_positions: line_table = StringEncoding.bytes_literal(build_line_table(func.node_positions, first_lineno).encode('iso8859-1'), 'iso8859-1') @@ -11772,7 +11678,7 @@ class TypeofNode(ExprNode): # Compile-time type of an expression, as a string. # # operand ExprNode - # literal StringNode # internal + # literal UnicodeNode # internal literal = None type = py_object_type @@ -11782,7 +11688,7 @@ class TypeofNode(ExprNode): def analyse_types(self, env): self.operand = self.operand.analyse_types(env) value = StringEncoding.EncodedString(str(self.operand.type)) #self.operand.type.typeof_name()) - literal = StringNode(self.pos, value=value) + literal = UnicodeNode(self.pos, value=value) literal = literal.analyse_types(env) self.literal = literal.coerce_to_pyobject(env) return self @@ -12256,7 +12162,7 @@ def is_py_operation_types(self, type1, type2): def infer_builtin_types_operation(self, type1, type2): # b'abc' + 'abc' raises an exception in Py3, # so we can safely infer a mix here. - string_types = (bytes_type, bytearray_type, basestring_type, str_type, unicode_type) + string_types = (bytes_type, bytearray_type, unicode_type) if type1 in string_types and type2 in string_types: return string_types[max(string_types.index(type1), string_types.index(type2))] @@ -12276,7 +12182,7 @@ def py_operation_function(self, code): type1, type2 = self.operand1.type, self.operand2.type func = None if type1 is unicode_type or type2 is unicode_type: - if type1 in (unicode_type, str_type) and type2 in (unicode_type, str_type): + if type1 is unicode_type and type2 is unicode_type: is_unicode_concat = True elif isinstance(self.operand1, FormattedValueNode) or isinstance(self.operand2, FormattedValueNode): # Assume that even if we don't know the second type, it's going to be a string. @@ -12290,10 +12196,6 @@ def py_operation_function(self, code): code.globalstate.use_utility_code( UtilityCode.load_cached("UnicodeConcatInPlace", "ObjectHandling.c")) func = '__Pyx_PyUnicode_Concat' - elif type1 is str_type and type2 is str_type: - code.globalstate.use_utility_code( - UtilityCode.load_cached("StrConcatInPlace", "ObjectHandling.c")) - func = '__Pyx_PyStr_Concat' if func: # any necessary utility code will be got by "NumberAdd" in generate_evaluation_code @@ -12634,8 +12536,7 @@ def is_py_operation_types(self, type1, type2): or NumBinopNode.is_py_operation_types(self, type1, type2)) def infer_builtin_types_operation(self, type1, type2): - # b'%s' % xyz raises an exception in Py3<3.5, so it's safe to infer the type for later Py3's. - if type1 in (unicode_type, bytes_type, str_type, basestring_type): + if type1 in (unicode_type, bytes_type, bytearray_type): # 'None % xyz' may be implemented by the RHS, but everything else will do string formatting. 
if type2.is_builtin_type or not type2.is_pyobject or not self.operand1.may_be_none(): return type1 @@ -12699,13 +12600,6 @@ def py_operation_function(self, code): return '__Pyx_PyUnicode_FormatSafe' else: return 'PyUnicode_Format' - elif type1 is str_type: - if self.operand1.may_be_none() or ( - type2.is_extension_type and type2.subtype_of(type1) or - type2 is py_object_type and not isinstance(self.operand2, CoerceToPyTypeNode)): - return '__Pyx_PyString_FormatSafe' - else: - return '__Pyx_PyString_Format' return super().py_operation_function(code) @@ -13351,14 +13245,8 @@ def find_common_type(self, env, op, operand1, common_type=None): new_common_type = None - # catch general errors - if (type1 == str_type and (type2.is_string or type2 in (bytes_type, unicode_type)) or - type2 == str_type and (type1.is_string or type1 in (bytes_type, unicode_type))): - error(self.pos, "Comparisons between bytes/unicode and str are not portable to Python 3") - new_common_type = error_type - # try to use numeric comparisons where possible - elif type1.is_complex or type2.is_complex: + if type1.is_complex or type2.is_complex: if (op not in ('==', '!=') and (type1.is_complex or type1.is_numeric) and (type2.is_complex or type2.is_numeric)): @@ -13479,14 +13367,6 @@ def find_special_bool_compare_function(self, env, operand1, result_is_bool=False self.special_bool_cmp_utility_code = UtilityCode.load_cached("BytesEquals", "StringTools.c") self.special_bool_cmp_function = "__Pyx_PyBytes_Equals" return True - elif type1 is Builtin.basestring_type or type2 is Builtin.basestring_type: - self.special_bool_cmp_utility_code = UtilityCode.load_cached("UnicodeEquals", "StringTools.c") - self.special_bool_cmp_function = "__Pyx_PyUnicode_Equals" - return True - elif type1 is Builtin.str_type or type2 is Builtin.str_type: - self.special_bool_cmp_utility_code = UtilityCode.load_cached("StrEquals", "StringTools.c") - self.special_bool_cmp_function = "__Pyx_PyString_Equals" - return True elif result_is_bool: from .Optimize import optimise_numeric_binop result = optimise_numeric_binop( @@ -14813,6 +14693,7 @@ def generate_result_code(self, code): code.error_goto_if_null(self.result(), self.pos))) self.generate_gotref(code) + class AnnotationNode(ExprNode): # Deals with the two possible uses of an annotation. # 1. The post PEP-563 use where an annotation is stored @@ -14831,14 +14712,14 @@ class AnnotationNode(ExprNode): untyped = False def __init__(self, pos, expr, string=None): - """string is expected to already be a StringNode or None""" + """string is expected to already be a UnicodeNode or None""" ExprNode.__init__(self, pos) if string is None: # import doesn't work at top of file? 
from .AutoDocTransforms import AnnotationWriter - string = StringEncoding.EncodedString( + string_value = StringEncoding.EncodedString( AnnotationWriter(description="annotation").write(expr)) - string = StringNode(pos, unicode_value=string, value=string.as_utf8_string()) + string = UnicodeNode(pos, value=string_value) self.string = string self.expr = expr diff --git a/Cython/Compiler/FusedNode.py b/Cython/Compiler/FusedNode.py index 712ed27be48..1b0ceb72ca0 100644 --- a/Cython/Compiler/FusedNode.py +++ b/Cython/Compiler/FusedNode.py @@ -932,7 +932,7 @@ def synthesize_defnodes(self, nodes): signatures = [StringEncoding.EncodedString(node.specialized_signature_string) for node in nodes] - keys = [ExprNodes.StringNode(node.pos, value=sig) + keys = [ExprNodes.UnicodeNode(node.pos, value=sig) for node, sig in zip(nodes, signatures)] values = [ExprNodes.PyCFunctionNode.from_defnode(node, binding=True) for node in nodes] diff --git a/Cython/Compiler/Interpreter.py b/Cython/Compiler/Interpreter.py index 774c0c31475..fb0f8c255d0 100644 --- a/Cython/Compiler/Interpreter.py +++ b/Cython/Compiler/Interpreter.py @@ -7,8 +7,7 @@ """ -from .Nodes import * -from .ExprNodes import * +from .ExprNodes import DictNode from .Errors import CompileError @@ -44,10 +43,7 @@ def interpret(node, ix): return (type, node.pos) else: raise CompileError(node.pos, "Type not allowed here.") - else: - if isinstance(node, StringNode) and node.unicode_value is not None: - return (node.unicode_value, node.pos) - return (node.compile_time_value(empty_scope), node.pos) + return (node.compile_time_value(empty_scope), node.pos) if optlist: optlist = [interpret(x, ix) for ix, x in enumerate(optlist)] diff --git a/Cython/Compiler/Nodes.py b/Cython/Compiler/Nodes.py index b21163b7139..4d351ce033f 100644 --- a/Cython/Compiler/Nodes.py +++ b/Cython/Compiler/Nodes.py @@ -5133,7 +5133,7 @@ def __init__(self, pos, name, bases, doc, body, decorators=None, from . import ExprNodes if self.doc and Options.docstrings: doc = embed_position(self.pos, self.doc) - doc_node = ExprNodes.StringNode(pos, value=doc) + doc_node = ExprNodes.UnicodeNode(pos, value=doc) self.doc_node = ExprNodes.NameNode(name=EncodedString('__doc__'), type=py_object_type, pos=pos) else: doc_node = None diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py index 27c7fa6af9f..7cbc997a1e2 100644 --- a/Cython/Compiler/Optimize.py +++ b/Cython/Compiler/Optimize.py @@ -1,14 +1,19 @@ +import string +import cython +cython.declare(UtilityCode=object, EncodedString=object, bytes_literal=object, encoded_string=object, + Nodes=object, ExprNodes=object, PyrexTypes=object, Builtin=object, + UtilNodes=object, _py_int_types=object, + re=object, copy=object, codecs=object, itertools=object, attrgetter=object) + + import re import copy import codecs import itertools +from operator import attrgetter from . 
import TypeSlots -from .ExprNodes import not_a_constant -import cython -cython.declare(UtilityCode=object, EncodedString=object, bytes_literal=object, encoded_string=object, - Nodes=object, ExprNodes=object, PyrexTypes=object, Builtin=object, - UtilNodes=object, _py_int_types=object) +from .ExprNodes import UnicodeNode, not_a_constant _py_string_types = (bytes, str) @@ -996,8 +1001,7 @@ def _transform_dict_iteration(self, node, dict_obj, method, keys, values): body.stats[0:0] = [iter_next_node] if method: - method_node = ExprNodes.StringNode( - dict_obj.pos, is_identifier=True, value=method) + method_node = ExprNodes.IdentifierStringNode(dict_obj.pos, value=method) dict_obj = dict_obj.as_none_safe_node( "'NoneType' object has no attribute '%{}s'".format('.30' if len(method) <= 30 else ''), error = "PyExc_AttributeError", @@ -1705,14 +1709,6 @@ def _handle_simple_function_ord(self, node, pos_args): value=str(ord(arg.value)), constant_result=ord(arg.value) ) - elif isinstance(arg, ExprNodes.StringNode): - if arg.unicode_value and len(arg.unicode_value) == 1 \ - and ord(arg.unicode_value) <= 255: # Py2/3 portability - return ExprNodes.IntNode( - arg.pos, type=PyrexTypes.c_int_type, - value=str(ord(arg.unicode_value)), - constant_result=ord(arg.unicode_value) - ) return node # sequence processing @@ -2421,47 +2417,12 @@ def _optimise_generic_builtin_method_call(self, node, attr_name, function, arg_l return node if not function.obj.type.is_builtin_type: return node - if function.obj.type.name in ('basestring', 'type'): - # these allow different actual types => unsafe + if function.obj.type is Builtin.type_type: + # allows different actual types => unsafe return node return ExprNodes.CachedBuiltinMethodCallNode( node, function.obj, attr_name, arg_list) - PyObject_String_func_type = PyrexTypes.CFuncType( - Builtin.unicode_type, [ - PyrexTypes.CFuncTypeArg("obj", PyrexTypes.py_object_type, None) - ]) - - def _handle_simple_function_str(self, node, function, pos_args): - """Optimize single argument calls to str(). 
- """ - if node.type is Builtin.unicode_type: - # type already deduced as unicode (language_level=3) - return self._handle_simple_function_unicode(node, function, pos_args) - if len(pos_args) != 1: - if len(pos_args) == 0: - return ExprNodes.StringNode(node.pos, value=EncodedString(), constant_result='') - return node - arg = pos_args[0] - - if arg.type is Builtin.str_type: - if not arg.may_be_none(): - return arg - - cname = "__Pyx_PyStr_Str" - utility_code = UtilityCode.load_cached('PyStr_Str', 'StringTools.c') - else: - cname = '__Pyx_PyObject_Str' - utility_code = UtilityCode.load_cached('PyObject_Str', 'StringTools.c') - - return ExprNodes.PythonCapiCallNode( - node.pos, cname, self.PyObject_String_func_type, - args=pos_args, - is_temp=node.is_temp, - utility_code=utility_code, - py_name="str" - ) - PyObject_Unicode_func_type = PyrexTypes.CFuncType( Builtin.unicode_type, [ PyrexTypes.CFuncTypeArg("obj", PyrexTypes.py_object_type, None) @@ -2472,7 +2433,7 @@ def _handle_simple_function_unicode(self, node, function, pos_args): """ if len(pos_args) != 1: if len(pos_args) == 0: - return ExprNodes.UnicodeNode(node.pos, value=EncodedString(), constant_result='') + return ExprNodes.UnicodeNode(node.pos, value=EncodedString()) return node arg = pos_args[0] if arg.type is Builtin.unicode_type: @@ -2490,6 +2451,8 @@ def _handle_simple_function_unicode(self, node, function, pos_args): utility_code=utility_code, py_name="unicode") + _handle_simple_function_str = _handle_simple_function_unicode + def visit_FormattedValueNode(self, node): """Simplify or avoid plain string formatting of a unicode value. This seems misplaced here, but plain unicode formatting is essentially @@ -2666,9 +2629,6 @@ def _handle_simple_function_float(self, node, function, pos_args): elif func_arg.type is Builtin.unicode_type: cfunc_name = "__Pyx_PyUnicode_AsDouble" utility_code_name = 'pyunicode_as_double' - elif func_arg.type is Builtin.str_type: - cfunc_name = "__Pyx_PyString_AsDouble" - utility_code_name = 'pystring_as_double' elif func_arg.type is Builtin.int_type: cfunc_name = "PyLong_AsDouble" utility_code_name = None @@ -2956,21 +2916,13 @@ def _handle_simple_function_ord(self, node, function, pos_args): return ExprNodes.TypecastNode( arg.pos, operand=arg.arg, type=PyrexTypes.c_long_type ).coerce_to(node.type, self.current_env()) - elif isinstance(arg, ExprNodes.UnicodeNode): + elif isinstance(arg, (ExprNodes.UnicodeNode, ExprNodes.BytesNode)): if len(arg.value) == 1: return ExprNodes.IntNode( arg.pos, type=PyrexTypes.c_int_type, value=str(ord(arg.value)), constant_result=ord(arg.value) ).coerce_to(node.type, self.current_env()) - elif isinstance(arg, ExprNodes.StringNode): - if arg.unicode_value and len(arg.unicode_value) == 1 \ - and ord(arg.unicode_value) <= 255: # Py2/3 portability - return ExprNodes.IntNode( - arg.pos, type=PyrexTypes.c_int_type, - value=str(ord(arg.unicode_value)), - constant_result=ord(arg.unicode_value) - ).coerce_to(node.type, self.current_env()) return node ### special methods @@ -3723,12 +3675,12 @@ def _handle_simple_method_unicode_join(self, node, function, args, is_unbound_me def _handle_simple_method_unicode_endswith(self, node, function, args, is_unbound_method): return self._inject_tailmatch( - node, function, args, is_unbound_method, 'unicode', 'endswith', + node, function, args, is_unbound_method, 'str', 'endswith', unicode_tailmatch_utility_code, +1) def _handle_simple_method_unicode_startswith(self, node, function, args, is_unbound_method): return self._inject_tailmatch( - 
node, function, args, is_unbound_method, 'unicode', 'startswith', + node, function, args, is_unbound_method, 'str', 'startswith', unicode_tailmatch_utility_code, -1) def _inject_tailmatch(self, node, function, args, is_unbound_method, type_name, @@ -3737,7 +3689,7 @@ def _inject_tailmatch(self, node, function, args, is_unbound_method, type_name, by a direct call to the corresponding C-API function. """ if len(args) not in (2,3,4): - self._error_wrong_arg_count('%s.%s' % (type_name, method_name), node, args, "2-4") + self._error_wrong_arg_count(f"{type_name}.{method_name}", node, args, "2-4") return node self._inject_int_default_argument( node, args, 2, PyrexTypes.c_py_ssize_t_type, "0") @@ -3746,10 +3698,14 @@ def _inject_tailmatch(self, node, function, args, is_unbound_method, type_name, args.append(ExprNodes.IntNode( node.pos, value=str(direction), type=PyrexTypes.c_int_type)) + if type_name == 'str': + func_name = "__Pyx_PyUnicode_Tailmatch" + else: + func_name = f"__Pyx_Py{type_name.capitalize()}_Tailmatch" + method_call = self._substitute_method_call( node, function, - "__Pyx_Py%s_Tailmatch" % type_name.capitalize(), - self.PyString_Tailmatch_func_type, + func_name, self.PyString_Tailmatch_func_type, method_name, is_unbound_method, args, utility_code = utility_code) return method_call.coerce_to(Builtin.bool_type, self.current_env()) @@ -3945,21 +3901,25 @@ def _handle_simple_method_bytes_decode(self, node, function, args, is_unbound_me return node # Try to extract encoding parameters and attempt constant decode. + string_node = args[0] parameters = self._unpack_encoding_and_error_mode(node.pos, args) if parameters is None: return node encoding, encoding_node, error_handling, error_handling_node = parameters - if args[0].has_constant_result(): + if string_node.has_constant_result(): try: - constant_result = args[0].constant_result.decode(encoding, error_handling) + constant_result = string_node.constant_result.decode(encoding, error_handling) except (AttributeError, ValueError, UnicodeDecodeError): pass else: - return UnicodeNode(args[0].pos, value=encoded_string(constant_result, encoding)) + return UnicodeNode( + string_node.pos, + value=EncodedString(constant_result), + bytes_value=string_node.constant_result, + ) # normalise input nodes - string_node = args[0] start = stop = None if isinstance(string_node, ExprNodes.SliceIndexNode): index_node = string_node @@ -4108,7 +4068,7 @@ def _unpack_string_and_cstring_node(self, node): encoding = node.value node = ExprNodes.BytesNode( node.pos, value=encoding.as_utf8_string(), type=PyrexTypes.c_const_char_ptr_type) - elif isinstance(node, (ExprNodes.StringNode, ExprNodes.BytesNode)): + elif isinstance(node, ExprNodes.BytesNode): encoding = node.value.decode('ISO-8859-1') node = ExprNodes.BytesNode( node.pos, value=node.value, type=PyrexTypes.c_const_char_ptr_type) @@ -4121,16 +4081,6 @@ def _unpack_string_and_cstring_node(self, node): encoding = node = None return encoding, node - def _handle_simple_method_str_endswith(self, node, function, args, is_unbound_method): - return self._inject_tailmatch( - node, function, args, is_unbound_method, 'str', 'endswith', - str_tailmatch_utility_code, +1) - - def _handle_simple_method_str_startswith(self, node, function, args, is_unbound_method): - return self._inject_tailmatch( - node, function, args, is_unbound_method, 'str', 'startswith', - str_tailmatch_utility_code, -1) - def _handle_simple_method_bytes_endswith(self, node, function, args, is_unbound_method): return self._inject_tailmatch( node, 
function, args, is_unbound_method, 'bytes', 'endswith', @@ -4299,7 +4249,6 @@ def optimise_numeric_binop(operator, node, ret_type, arg0, arg1): unicode_tailmatch_utility_code = UtilityCode.load_cached('unicode_tailmatch', 'StringTools.c') bytes_tailmatch_utility_code = UtilityCode.load_cached('bytes_tailmatch', 'StringTools.c') -str_tailmatch_utility_code = UtilityCode.load_cached('str_tailmatch', 'StringTools.c') class ConstantFolding(Visitor.VisitorTransform, SkipDeclarations): @@ -4532,8 +4481,7 @@ def visit_AddNode(self, node): str1.bytes_value + str2.bytes_value, str1.bytes_value.encoding) string_value = EncodedString(node.constant_result) - return ExprNodes.UnicodeNode( - str1.pos, value=string_value, constant_result=node.constant_result, bytes_value=bytes_value) + return ExprNodes.UnicodeNode(str1.pos, value=string_value, bytes_value=bytes_value) elif isinstance(str1, ExprNodes.BytesNode) and isinstance(str2, ExprNodes.BytesNode): if str1.value.encoding == str2.value.encoding: bytes_value = bytes_literal(node.constant_result, str1.value.encoding) @@ -4565,16 +4513,10 @@ def _multiply_string(self, node, string_node, multiplier_node): # Too long for static creation, leave it to runtime. (-> arbitrary limit) return node - build_string = encoded_string if isinstance(string_node, ExprNodes.BytesNode): build_string = bytes_literal - elif isinstance(string_node, ExprNodes.StringNode): - if string_node.unicode_value is not None: - string_node.unicode_value = encoded_string( - string_node.unicode_value * multiplier, - string_node.unicode_value.encoding) - build_string = encoded_string if string_node.value.is_unicode else bytes_literal elif isinstance(string_node, ExprNodes.UnicodeNode): + build_string = encoded_string if string_node.bytes_value is not None: string_node.bytes_value = bytes_literal( string_node.bytes_value * multiplier, @@ -4585,10 +4527,7 @@ def _multiply_string(self, node, string_node, multiplier_node): string_node.value * multiplier, string_node.value.encoding) # follow constant-folding and use unicode_value in preference - if isinstance(string_node, ExprNodes.StringNode) and string_node.unicode_value is not None: - string_node.constant_result = string_node.unicode_value - else: - string_node.constant_result = string_node.value + string_node.constant_result = string_node.value return string_node def _calculate_constant_seq(self, node, sequence_node, factor): @@ -4635,13 +4574,13 @@ def _build_fstring(self, pos, ustring, format_args): if not s: continue if s == '%%': - substrings.append(ExprNodes.UnicodeNode(pos, value=EncodedString('%'), constant_result='%')) + substrings.append(ExprNodes.UnicodeNode(pos, value=EncodedString('%'))) continue if s[0] != '%': if s[-1] == '%': - warning(pos, "Incomplete format: '...%s'" % s[-3:], level=1) + warning(pos, f"Incomplete format: '...{s[-3:]}'", level=1) can_be_optimised = False - substrings.append(ExprNodes.UnicodeNode(pos, value=EncodedString(s), constant_result=s)) + substrings.append(ExprNodes.UnicodeNode(pos, value=EncodedString(s))) continue format_type = s[-1] try: @@ -4674,8 +4613,7 @@ def _build_fstring(self, pos, ustring, format_args): substrings.append(ExprNodes.FormattedValueNode( arg.pos, value=arg, conversion_char=conversion_char, - format_spec=ExprNodes.UnicodeNode( - pos, value=EncodedString(format_spec), constant_result=format_spec) + format_spec=ExprNodes.UnicodeNode(pos, value=EncodedString(format_spec)) if format_spec else None, )) else: @@ -4700,20 +4638,15 @@ def _build_fstring(self, pos, ustring, format_args): 
def visit_FormattedValueNode(self, node): self.visitchildren(node) conversion_char = node.conversion_char or 's' - if isinstance(node.format_spec, ExprNodes.UnicodeNode) and not node.format_spec.value: + if node.format_spec is not None and node.format_spec.is_string_literal and not node.format_spec.value: node.format_spec = None if node.format_spec is None and isinstance(node.value, ExprNodes.IntNode): value = EncodedString(node.value.value) if value.isdigit(): - return ExprNodes.UnicodeNode(node.value.pos, value=value, constant_result=value) + return ExprNodes.UnicodeNode(node.value.pos, value=value) if node.format_spec is None and conversion_char == 's': - value = None - if isinstance(node.value, ExprNodes.UnicodeNode): - value = node.value.value - elif isinstance(node.value, ExprNodes.StringNode): - value = node.value.unicode_value - if value is not None: - return ExprNodes.UnicodeNode(node.value.pos, value=value, constant_result=value) + if node.value.is_string_literal: + return node.value return node def visit_JoinedStrNode(self, node): @@ -4723,16 +4656,15 @@ def visit_JoinedStrNode(self, node): because f-string format specs are always parsed into JoinedStrNodes. """ self.visitchildren(node) - unicode_node = ExprNodes.UnicodeNode values = [] - for is_unode_group, substrings in itertools.groupby(node.values, lambda v: isinstance(v, unicode_node)): + for is_unode_group, substrings in itertools.groupby(node.values, key=attrgetter('is_string_literal')): if is_unode_group: substrings = list(substrings) unode = substrings[0] if len(substrings) > 1: value = EncodedString(''.join(value.value for value in substrings)) - unode = ExprNodes.UnicodeNode(unode.pos, value=value, constant_result=value) + unode = ExprNodes.UnicodeNode(unode.pos, value=value) # ignore empty Unicode strings if unode.value: values.append(unode) @@ -4740,8 +4672,7 @@ def visit_JoinedStrNode(self, node): values.extend(substrings) if not values: - value = EncodedString('') - node = ExprNodes.UnicodeNode(node.pos, value=value, constant_result=value) + node = ExprNodes.UnicodeNode(node.pos, value=EncodedString('')) elif len(values) == 1: node = values[0] elif len(values) == 2: diff --git a/Cython/Compiler/Options.py b/Cython/Compiler/Options.py index f6dff5f6bcb..1e10c0e6e27 100644 --- a/Cython/Compiler/Options.py +++ b/Cython/Compiler/Options.py @@ -369,7 +369,7 @@ class DEFER_ANALYSIS_OF_ARGUMENTS: 'exceptval': type, # actually (type, check=True/False), but has its own parser 'set_initial_path': str, 'freelist': int, - 'c_string_type': one_of('bytes', 'bytearray', 'str', 'unicode', map={'str': 'unicode'}), + 'c_string_type': one_of('bytes', 'bytearray', 'str', 'unicode', map={'unicode': 'str'}), 'c_string_encoding': normalise_encoding_name, 'trashcan': bool, 'total_ordering': None, @@ -468,13 +468,13 @@ def parse_directive_value(name, value, relaxed_bool=False): >>> parse_directive_value('c_string_encoding', 'us-ascii') 'ascii' >>> parse_directive_value('c_string_type', 'str') - 'unicode' + 'str' >>> parse_directive_value('c_string_type', 'bytes') 'bytes' >>> parse_directive_value('c_string_type', 'bytearray') 'bytearray' >>> parse_directive_value('c_string_type', 'unicode') - 'unicode' + 'str' >>> parse_directive_value('c_string_type', 'unnicode') Traceback (most recent call last): ValueError: c_string_type directive must be one of ('bytes', 'bytearray', 'str', 'unicode'), got 'unnicode' diff --git a/Cython/Compiler/ParseTreeTransforms.py b/Cython/Compiler/ParseTreeTransforms.py index cfda826ff2a..07b0cbfc54a 100644 --- 
a/Cython/Compiler/ParseTreeTransforms.py +++ b/Cython/Compiler/ParseTreeTransforms.py @@ -1235,8 +1235,7 @@ def try_to_parse_directive(self, optname, args, kwds, pos): 'The %s directive takes one compile-time integer argument' % optname) return (optname, int(args[0].value)) elif directivetype is str: - if kwds is not None or len(args) != 1 or not isinstance( - args[0], (ExprNodes.StringNode, ExprNodes.UnicodeNode)): + if kwds is not None or len(args) != 1 or not isinstance(args[0], ExprNodes.UnicodeNode): raise PostParseError(pos, 'The %s directive takes one compile-time string argument' % optname) return (optname, str(args[0].value)) @@ -1256,8 +1255,7 @@ def try_to_parse_directive(self, optname, args, kwds, pos): 'The %s directive takes no keyword arguments' % optname) return optname, [ str(arg.value) for arg in args ] elif callable(directivetype): - if kwds is not None or len(args) != 1 or not isinstance( - args[0], (ExprNodes.StringNode, ExprNodes.UnicodeNode)): + if kwds is not None or len(args) != 1 or not isinstance(args[0], ExprNodes.UnicodeNode): raise PostParseError(pos, 'The %s directive takes one compile-time string argument' % optname) return (optname, directivetype(optname, str(args[0].value))) @@ -2581,8 +2579,8 @@ def visit_CStructOrUnionDefNode(self, node): "INIT_ASSIGNMENTS": Nodes.StatListNode(node.pos, stats = init_assignments), "IS_UNION": ExprNodes.BoolNode(node.pos, value = not node.entry.type.is_struct), "MEMBER_TUPLE": ExprNodes.TupleNode(node.pos, args=attributes), - "STR_FORMAT": ExprNodes.StringNode(node.pos, value = EncodedString(str_format)), - "REPR_FORMAT": ExprNodes.StringNode(node.pos, value = EncodedString(str_format.replace("%s", "%r"))), + "STR_FORMAT": ExprNodes.UnicodeNode(node.pos, value = EncodedString(str_format)), + "REPR_FORMAT": ExprNodes.UnicodeNode(node.pos, value = EncodedString(str_format.replace("%s", "%r"))), }, pos = node.pos).stats[0] wrapper_class.class_name = node.name wrapper_class.shadow = True @@ -2787,12 +2785,9 @@ def generate_assignment(self, node, name, value): entry = node.scope.lookup_here(name) lhs = ExprNodes.NameNode( node.pos, - name = EncodedString(name), + name=EncodedString(name), entry=entry) - rhs = ExprNodes.StringNode( - node.pos, - value=value.as_utf8_string(), - unicode_value=value) + rhs = ExprNodes.UnicodeNode(node.pos, value=value) node.body.stats.insert(0, Nodes.SingleAssignmentNode( node.pos, lhs=lhs, @@ -3845,7 +3840,7 @@ def visit_cython_attribute(self, node): if attribute: if attribute == '__version__': from .. 
import __version__ as version - node = ExprNodes.StringNode(node.pos, value=EncodedString(version)) + node = ExprNodes.UnicodeNode(node.pos, value=EncodedString(version)) elif attribute == 'NULL': node = ExprNodes.NullNode(node.pos) elif attribute in ('set', 'frozenset', 'staticmethod'): diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py index 75f57b43688..78e44df0595 100644 --- a/Cython/Compiler/Parsing.py +++ b/Cython/Compiler/Parsing.py @@ -808,18 +808,20 @@ def p_atom_string(s: PyrexScanner): # s.sy == 'BEGIN_STRING' pos = s.position() kind, bytes_value, unicode_value = p_cat_string_literal(s) - if kind == 'c': + if not kind: + return ExprNodes.UnicodeNode(pos, value=unicode_value, bytes_value=bytes_value) + kind_char: cython.Py_UCS4 = kind + if kind_char == 'c': return ExprNodes.CharNode(pos, value=bytes_value) - elif kind == 'u': + elif kind_char == 'u': return ExprNodes.UnicodeNode(pos, value=unicode_value, bytes_value=bytes_value) - elif kind == 'b': + elif kind_char == 'b': return ExprNodes.BytesNode(pos, value=bytes_value) - elif kind == 'f': + elif kind_char == 'f': return ExprNodes.JoinedStrNode(pos, values=unicode_value) - elif kind == '': - return ExprNodes.StringNode(pos, value=bytes_value, unicode_value=unicode_value) else: - s.error("invalid string kind '%s'" % kind) + # This is actually prevented by the scanner (Lexicon.py). + s.error(f"invalid string kind '{kind}'") @cython.cfunc @@ -1359,7 +1361,7 @@ def p_f_string_expr(s: PyrexScanner, unicode_value, pos: tuple, nodes = [] if expr_text: - nodes.append(ExprNodes.UnicodeNode(pos, value=StringEncoding.EncodedString(expr_text))) + nodes.append(ExprNodes.UnicodeNode(pos, value=EncodedString(expr_text))) nodes.append(ExprNodes.FormattedValueNode(pos, value=expr, conversion_char=conversion_char, format_spec=format_spec)) return i + 1, nodes @@ -4132,10 +4134,6 @@ def _extract_docstring(node) -> tuple: warning(node.pos, "Python 3 requires docstrings to be unicode strings") doc = doc_node.value - elif isinstance(doc_node, ExprNodes.StringNode): - doc = doc_node.unicode_value - if doc is None: - doc = doc_node.value else: doc = doc_node.value return doc, node diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py index 84c26d16e87..af4f4e913ec 100644 --- a/Cython/Compiler/PyrexTypes.py +++ b/Cython/Compiler/PyrexTypes.py @@ -1392,7 +1392,7 @@ def nullcheck_string(self, cname): builtin_types_that_cannot_create_refcycles = frozenset({ 'object', 'bool', 'int', 'long', 'float', 'complex', - 'bytearray', 'bytes', 'unicode', 'str', 'basestring', + 'bytearray', 'bytes', 'str', }) builtin_types_with_trashcan = frozenset({ @@ -1452,10 +1452,7 @@ def default_coerced_ctype(self): def assignable_from(self, src_type): if isinstance(src_type, BuiltinObjectType): - if self.name == 'basestring': - return src_type.name in ('str', 'unicode', 'basestring') - else: - return src_type.name == self.name + return src_type.name == self.name elif src_type.is_extension_type: # FIXME: This is an ugly special case that we currently # keep supporting. 
It allows users to specify builtin @@ -1479,9 +1476,7 @@ def subtype_of(self, type): def type_check_function(self, exact=True): type_name = self.name if type_name == 'str': - type_check = 'PyString_Check' - elif type_name == 'basestring': - type_check = '__Pyx_PyBaseString_Check' + type_check = 'PyUnicode_Check' elif type_name == 'Exception': type_check = '__Pyx_PyException_Check' elif type_name == 'BaseException': @@ -1506,14 +1501,10 @@ def isinstance_code(self, arg): def type_test_code(self, arg, notnone=False, exact=True): type_check = self.type_check_function(exact=exact) - check = 'likely(%s(%s))' % (type_check, arg) + check = f'likely({type_check}({arg}))' if not notnone: - check += '||((%s) == Py_None)' % arg - if self.name == 'basestring': - name = '(PY_MAJOR_VERSION < 3 ? "basestring" : "str")' - else: - name = '"%s"' % self.name - return check + ' || __Pyx_RaiseUnexpectedTypeError(%s, %s)' % (name, arg) + check += f'||(({arg}) == Py_None)' + return check + f' || __Pyx_RaiseUnexpectedTypeError("{self.name}", {arg})' def declaration_code(self, entity_code, for_display = 0, dll_linkage = None, pyrex = 0): diff --git a/Cython/Compiler/Scanning.py b/Cython/Compiler/Scanning.py index 9cba8543c35..99d54bd7640 100644 --- a/Cython/Compiler/Scanning.py +++ b/Cython/Compiler/Scanning.py @@ -112,7 +112,7 @@ def initial_compile_time_env(): # Py2/3 adaptations from functools import reduce benv.declare('reduce', reduce) - benv.declare('unicode', getattr(builtins, 'unicode', getattr(builtins, 'str'))) + benv.declare('unicode', str) benv.declare('long', getattr(builtins, 'long', getattr(builtins, 'int'))) benv.declare('xrange', getattr(builtins, 'xrange', getattr(builtins, 'range'))) diff --git a/Cython/Compiler/Symtab.py b/Cython/Compiler/Symtab.py index 41936b3bc81..f322abe858b 100644 --- a/Cython/Compiler/Symtab.py +++ b/Cython/Compiler/Symtab.py @@ -1227,14 +1227,12 @@ def __init__(self): # which is apparently a special case because it conflicts with C++ bool self.declare_var("bool", py_object_type, None, "((PyObject*)&PyBool_Type)") - def lookup(self, name, language_level=None, str_is_str=None): - # 'language_level' and 'str_is_str' are passed by ModuleScope - if name == 'str': - if str_is_str is None: - str_is_str = language_level in (None, 2) - if not str_is_str: - name = 'unicode' - if name == 'long' and language_level == 2: + def lookup(self, name, language_level=None): + # 'language_level' is passed by ModuleScope + if name == 'unicode' or name == 'basestring': + # Keep recognising 'unicode' and 'basestring' in legacy code but map them to 'str'. + name = 'str' + elif name == 'long' and language_level == 2: # Keep recognising 'long' in legacy Py2 code but map it to 'int'. 
name = 'int' return Scope.lookup(self, name) @@ -1379,18 +1377,14 @@ def qualifying_scope(self): def global_scope(self): return self - def lookup(self, name, language_level=None, str_is_str=None): + def lookup(self, name, language_level=None): entry = self.lookup_here(name) if entry is not None: return entry if language_level is None: language_level = self.context.language_level if self.context is not None else 3 - if str_is_str is None: - str_is_str = language_level == 2 or ( - self.context is not None and Future.unicode_literals not in self.context.future_directives) - - return self.outer_scope.lookup(name, language_level=language_level, str_is_str=str_is_str) + return self.outer_scope.lookup(name, language_level=language_level) def declare_tuple_type(self, pos, components): components = tuple(components) diff --git a/Cython/Compiler/Visitor.py b/Cython/Compiler/Visitor.py index 4eac5ff9c4a..ae8fc05f088 100644 --- a/Cython/Compiler/Visitor.py +++ b/Cython/Compiler/Visitor.py @@ -581,9 +581,9 @@ def _find_handler(self, match_name, has_kwargs): return None call_type = 'general' if has_kwargs else 'simple' - handler = getattr(self, '_handle_%s_%s' % (call_type, match_name), None) + handler = getattr(self, f'_handle_{call_type}_{match_name}', None) if handler is None: - handler = getattr(self, '_handle_any_%s' % match_name, None) + handler = getattr(self, f'_handle_any_{match_name}', None) return handler def _delegate_to_assigned_value(self, node, function, arg_list, kwargs): @@ -628,7 +628,7 @@ def _dispatch_to_handler(self, node, function, arg_list, kwargs): node=node, function=function, arg_list=arg_list, kwargs=kwargs) return node function_handler = self._find_handler( - "function_%s" % function.name, kwargs) + f"function_{function.name}", kwargs) if function_handler is None: return self._handle_function(node, function.name, function, arg_list, kwargs) if kwargs: @@ -659,6 +659,9 @@ def _dispatch_to_handler(self, node, function, arg_list, kwargs): is_unbound_method = True else: type_name = obj_type.name + if type_name == 'str': + # We traditionally used the type name 'unicode' for 'str' dispatch methods. + type_name = 'unicode' else: type_name = "object" # safety measure return self._dispatch_to_method_handler( @@ -671,12 +674,12 @@ def _dispatch_to_method_handler(self, attr_name, self_arg, is_unbound_method, type_name, node, function, arg_list, kwargs): method_handler = self._find_handler( - "method_%s_%s" % (type_name, attr_name), kwargs) + f"method_{type_name}_{attr_name}", kwargs) if method_handler is None: if (attr_name in TypeSlots.special_method_names or attr_name in ['__new__', '__class__']): method_handler = self._find_handler( - "slot%s" % attr_name, kwargs) + f"slot{attr_name}", kwargs) if method_handler is None: return self._handle_method( node, type_name, attr_name, function, diff --git a/Cython/Includes/cpython/object.pxd b/Cython/Includes/cpython/object.pxd index 41874159ce7..01a84248ffd 100644 --- a/Cython/Includes/cpython/object.pxd +++ b/Cython/Includes/cpython/object.pxd @@ -210,13 +210,6 @@ cdef extern from "Python.h": # Unlike bytes(o), a TypeError is raised when o is an integer # instead of a zero-initialized bytes object. - object PyObject_Unicode(object o) - # Return value: New reference. - # Compute a Unicode string representation of object o. Returns the - # Unicode string representation on success, NULL on failure. This - # is the equivalent of the Python expression "unicode(o)". Called - # by the unicode() built-in function. 
- bint PyObject_IsInstance(object inst, object cls) except -1 # Returns 1 if inst is an instance of the class cls or a subclass # of cls, or 0 if not. On error, returns -1 and sets an diff --git a/Cython/Shadow.py b/Cython/Shadow.py index 424cd6945f7..6a6c8fd3ab1 100644 --- a/Cython/Shadow.py +++ b/Cython/Shadow.py @@ -472,10 +472,7 @@ def _specialized_from_args(signatures, args, kwargs): gs = globals() -import builtins - -gs['unicode'] = typedef(getattr(builtins, 'unicode', str), 'unicode') -del builtins +gs['unicode'] = typedef(str, 'unicode') for name in int_types: reprname = to_repr(name, name) diff --git a/Cython/Tempita/_looper.py b/Cython/Tempita/_looper.py index 4864f294960..53fb6d11b35 100644 --- a/Cython/Tempita/_looper.py +++ b/Cython/Tempita/_looper.py @@ -18,8 +18,6 @@ """ -basestring_ = (bytes, str) - __all__ = ['looper'] @@ -142,7 +140,7 @@ def last_group(self, getter=None): def _compare_group(self, item, other, getter): if getter is None: return item != other - elif (isinstance(getter, basestring_) + elif (isinstance(getter, str) and getter.startswith('.')): getter = getter[1:] if getter.endswith('()'): diff --git a/Cython/Tempita/_tempita.py b/Cython/Tempita/_tempita.py index c5269f25ff3..114ed47fca7 100644 --- a/Cython/Tempita/_tempita.py +++ b/Cython/Tempita/_tempita.py @@ -42,10 +42,9 @@ def foo(bar): in_re = re.compile(r'\s+in\s+') var_re = re.compile(r'^[a-z_][a-z0-9_]*$', re.I) -basestring_ = (bytes, str) def coerce_text(v): - if not isinstance(v, basestring_): + if not isinstance(v, str): if hasattr(v, '__str__'): return str(v) else: @@ -116,7 +115,7 @@ def __init__(self, content, name=None, namespace=None, stacklevel=None, delimiters = (self.default_namespace['start_braces'], self.default_namespace['end_braces']) else: - #assert len(delimiters) == 2 and all([isinstance(delimiter, basestring) + #assert len(delimiters) == 2 and all([isinstance(delimiter, str) # for delimiter in delimiters]) self.default_namespace = self.__class__.default_namespace.copy() self.default_namespace['start_braces'] = delimiters[0] @@ -220,7 +219,7 @@ def _interpret_inherit(self, body, defs, inherit_template, ns): def _interpret_codes(self, codes, ns, out, defs): __traceback_hide__ = True for item in codes: - if isinstance(item, basestring_): + if isinstance(item, str): out.append(item) else: self._interpret_code(item, ns, out, defs) @@ -291,7 +290,7 @@ def _interpret_if(self, parts, ns, out, defs): __traceback_hide__ = True # @@: if/else/else gets through for part in parts: - assert not isinstance(part, basestring_) + assert not isinstance(part, str) name, pos = part[0], part[1] if name == 'else': result = True @@ -340,7 +339,7 @@ def _repr(self, value, pos): except UnicodeDecodeError: value = bytes(value) else: - if not isinstance(value, basestring_): + if not isinstance(value, str): value = coerce_text(value) if (isinstance(value, str) and self.default_encoding): @@ -629,7 +628,7 @@ def trim_lex(tokens): """ last_trim = None for i, current in enumerate(tokens): - if isinstance(current, basestring_): + if isinstance(current, str): # we don't trim this continue item = current[0] @@ -643,8 +642,8 @@ def trim_lex(tokens): next_chunk = '' else: next_chunk = tokens[i + 1] - if (not isinstance(next_chunk, basestring_) - or not isinstance(prev, basestring_)): + if (not isinstance(next_chunk, str) + or not isinstance(prev, str)): continue prev_ok = not prev or trail_whitespace_re.search(prev) if i == 1 and not prev.strip(): @@ -746,7 +745,7 @@ def parse(s, name=None, line_offset=0, 
delimiters=None): def parse_expr(tokens, name, context=()): - if isinstance(tokens[0], basestring_): + if isinstance(tokens[0], str): return tokens[0], tokens[1:] expr, pos = tokens[0] expr = expr.strip() diff --git a/Cython/Utility/Builtins.c b/Cython/Utility/Builtins.c index a08e7d7318b..493db1ac945 100644 --- a/Cython/Utility/Builtins.c +++ b/Cython/Utility/Builtins.c @@ -167,7 +167,7 @@ static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, PyObject return (res != 0) ? r : __Pyx_NewRef(d); #else #if CYTHON_USE_TYPE_SLOTS - if (likely(PyString_Check(n))) { + if (likely(PyUnicode_Check(n))) { r = __Pyx_PyObject_GetAttrStrNoError(o, n); if (unlikely(!r) && likely(!PyErr_Occurred())) { r = __Pyx_NewRef(d); @@ -194,7 +194,7 @@ static CYTHON_INLINE int __Pyx_HasAttr(PyObject *, PyObject *); /*proto*/ #if __PYX_LIMITED_VERSION_HEX < 0x030d00A1 static CYTHON_INLINE int __Pyx_HasAttr(PyObject *o, PyObject *n) { PyObject *r; - if (unlikely(!__Pyx_PyBaseString_Check(n))) { + if (unlikely(!PyUnicode_Check(n))) { PyErr_SetString(PyExc_TypeError, "hasattr(): attribute name must be string"); return -1; diff --git a/Cython/Utility/CythonFunction.c b/Cython/Utility/CythonFunction.c index caaca173210..e4c998c0123 100644 --- a/Cython/Utility/CythonFunction.c +++ b/Cython/Utility/CythonFunction.c @@ -1439,7 +1439,7 @@ _obj_to_string(PyObject *obj) else if (PyType_Check(obj)) return PyObject_GetAttr(obj, PYIDENT("__name__")); else - return PyObject_Unicode(obj); + return PyObject_Str(obj); } static PyObject * diff --git a/Cython/Utility/ModuleSetupCode.c b/Cython/Utility/ModuleSetupCode.c index 39e49a528f4..9421bdaf4b6 100644 --- a/Cython/Utility/ModuleSetupCode.c +++ b/Cython/Utility/ModuleSetupCode.c @@ -1042,23 +1042,7 @@ static CYTHON_INLINE PyObject * __Pyx_PyDict_GetItemStrWithError(PyObject *dict, #endif // ("..." % x) must call PyNumber_Remainder() if x is a string subclass that implements "__rmod__()". -#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyString_Check(b) && !PyString_CheckExact(b)))) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) #define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? 
PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) -#define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) - -// TODO: remove this block -#define PyBaseString_Type PyUnicode_Type -#define PyStringObject PyUnicodeObject -#define PyString_Type PyUnicode_Type -#define PyString_Check PyUnicode_Check -#define PyString_CheckExact PyUnicode_CheckExact -// PyPy3 used to define "PyObject_Unicode" -#ifndef PyObject_Unicode - #define PyObject_Unicode PyObject_Str -#endif - -#define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) -#define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) #if CYTHON_COMPILING_IN_CPYTHON #define __Pyx_PySequence_ListKeepNew(obj) \ diff --git a/Cython/Utility/ObjectHandling.c b/Cython/Utility/ObjectHandling.c index da6c50ff682..53c8c6000a5 100644 --- a/Cython/Utility/ObjectHandling.c +++ b/Cython/Utility/ObjectHandling.c @@ -2882,18 +2882,6 @@ static CYTHON_INLINE PyObject *__Pyx_PyUnicode_ConcatInPlaceImpl(PyObject **p_le } #endif -////////////// StrConcatInPlace.proto /////////////////////// -//@requires: UnicodeConcatInPlace - -// allow access to the more efficient versions where we know str_type is unicode -#define __Pyx_PyStr_Concat __Pyx_PyUnicode_Concat -#define __Pyx_PyStr_ConcatInPlace __Pyx_PyUnicode_ConcatInPlace - -#define __Pyx_PyStr_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ? \ - PyNumber_Add(a, b) : __Pyx_PyStr_Concat(a, b)) -#define __Pyx_PyStr_ConcatInPlaceSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ? \ - PyNumber_InPlaceAdd(a, b) : __Pyx_PyStr_ConcatInPlace(a, b)) - /////////////// PySequenceMultiply.proto /////////////// @@ -2949,7 +2937,7 @@ __Pyx_PyType_GetName(PyTypeObject* tp) if (unlikely(name == NULL) || unlikely(!PyUnicode_Check(name))) { PyErr_Clear(); Py_XDECREF(name); - name = __Pyx_NewRef(PYIDENT("?")); + name = __Pyx_NewRef(PYUNICODE("?")); } return name; } diff --git a/Cython/Utility/Optimize.c b/Cython/Utility/Optimize.c index 1996c7598d3..b2b755fbe19 100644 --- a/Cython/Utility/Optimize.c +++ b/Cython/Utility/Optimize.c @@ -217,7 +217,7 @@ static PyObject* __Pyx_PyDict_GetItemDefault(PyObject* d, PyObject* key, PyObjec // avoid C compiler warning about unused utility functions if ((1)); #else - if (PyString_CheckExact(key) || PyUnicode_CheckExact(key) || PyLong_CheckExact(key)) { + if (PyBytes_CheckExact(key) || PyUnicode_CheckExact(key) || PyLong_CheckExact(key)) { /* these presumably have safe hash functions */ value = PyDict_GetItem(d, key); if (unlikely(!value)) { @@ -705,15 +705,6 @@ static double __Pyx__PyObject_AsDouble(PyObject* obj) { } -/////////////// pystring_as_double.proto /////////////// -//@requires: pyunicode_as_double - -// TODO: remove -static CYTHON_INLINE double __Pyx_PyString_AsDouble(PyObject *obj) { - return __Pyx_PyUnicode_AsDouble(obj); -} - - /////////////// pyunicode_as_double.proto /////////////// static CYTHON_INLINE double __Pyx_PyUnicode_AsDouble(PyObject *obj);/*proto*/ diff --git a/Cython/Utility/StringTools.c b/Cython/Utility/StringTools.c index ec5a8bb7bc1..edf05c1e740 100644 --- a/Cython/Utility/StringTools.c +++ b/Cython/Utility/StringTools.c @@ -57,7 +57,7 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry const *t, PyObject **target, c static int __Pyx_InitStrings(__Pyx_StringTabEntry const *t, PyObject **target, const char* const* encoding_names) { while (t->s) { PyObject *str; - if (t->is_unicode | t->is_str) { + if (t->is_unicode) { if (t->intern) { str = PyUnicode_InternFromString(t->s); } else if (t->encoding) { @@ 
-136,13 +136,6 @@ static CYTHON_INLINE int __Pyx_StrEq(const char *s1, const char *s2) { } -//////////////////// StrEquals.proto //////////////////// -//@requires: UnicodeEquals - -// TODO: remove -#define __Pyx_PyString_Equals __Pyx_PyUnicode_Equals - - //////////////////// UnicodeEquals.proto //////////////////// static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals); /*proto*/ @@ -820,26 +813,6 @@ static int __Pyx_PyBytes_Tailmatch(PyObject* self, PyObject* substr, } -/////////////// str_tailmatch.proto /////////////// - -static CYTHON_INLINE int __Pyx_PyStr_Tailmatch(PyObject* self, PyObject* arg, Py_ssize_t start, - Py_ssize_t end, int direction); /*proto*/ - -/////////////// str_tailmatch /////////////// -//@requires: unicode_tailmatch - -// TODO: remove -static CYTHON_INLINE int __Pyx_PyStr_Tailmatch(PyObject* self, PyObject* arg, Py_ssize_t start, - Py_ssize_t end, int direction) -{ - // We do not use a C compiler macro here to avoid "unused function" - // warnings for the *_Tailmatch() function that is not being used in - // the specific CPython version. The C compiler will generate the same - // code anyway, and will usually just remove the unused function. - return __Pyx_PyUnicode_Tailmatch(self, arg, start, end, direction); -} - - /////////////// bytes_index.proto /////////////// static CYTHON_INLINE char __Pyx_PyBytes_GetItemInt(PyObject* bytes, Py_ssize_t index, int check_bounds); /*proto*/ @@ -875,8 +848,6 @@ static CYTHON_INLINE char __Pyx_PyBytes_GetItemInt(PyObject* bytes, Py_ssize_t i //////////////////// StringJoin.proto //////////////////// -#define __Pyx_PyString_Join PyUnicode_Join -#define __Pyx_PyBaseString_Join PyUnicode_Join static CYTHON_INLINE PyObject* __Pyx_PyBytes_Join(PyObject* sep, PyObject* values); /*proto*/ //////////////////// StringJoin //////////////////// @@ -1233,23 +1204,3 @@ static CYTHON_INLINE PyObject* __Pyx_PyUnicode_Unicode(PyObject *obj) { #define __Pyx_PyObject_Unicode(obj) \ (likely(PyUnicode_CheckExact(obj)) ? __Pyx_NewRef(obj) : PyObject_Str(obj)) - - -//////////////////// PyStr_Str.proto //////////////////// - -static CYTHON_INLINE PyObject* __Pyx_PyStr_Str(PyObject *obj);/*proto*/ - -//////////////////// PyStr_Str //////////////////// - -static CYTHON_INLINE PyObject* __Pyx_PyStr_Str(PyObject *obj) { - if (unlikely(obj == Py_None)) - obj = PYIDENT("None"); - return __Pyx_NewRef(obj); -} - - -//////////////////// PyObject_Str.proto //////////////////// - -#define __Pyx_PyObject_Str(obj) \ - (likely(PyString_CheckExact(obj)) ? __Pyx_NewRef(obj) : PyObject_Str(obj)) - diff --git a/tests/errors/string_assignments.pyx b/tests/errors/string_assignments.pyx index 84b3c8766f1..9fd9723e6a8 100644 --- a/tests/errors/string_assignments.pyx +++ b/tests/errors/string_assignments.pyx @@ -77,38 +77,33 @@ print c1[1:2] _ERRORS = u""" 36:20: Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings). 37:20: Unicode objects only support coercion to Py_UNICODE*. -38:20: 'str' objects do not support coercion to C types (use 'bytes'?). +38:20: Unicode objects only support coercion to Py_UNICODE*. 40:25: Cannot assign type 'char *' to 'Py_UNICODE *' -41:25: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'. -42:25: 'str' objects do not support coercion to C types (use 'unicode'?). -43:25: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'. +41:25: Cannot convert 'bytes' object to Py_UNICODE*, use 'str'. 
+43:25: Cannot convert 'bytes' object to Py_UNICODE*, use 'str'.
 45:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
 46:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
-47:20: Cannot convert 'str' to 'bytes' implicitly. This is not portable.
-48:20: Cannot convert 'basestring' object to bytes implicitly. This is not portable.
-
-50:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
-51:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
-52:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
-53:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
-
-55:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
-56:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
-57:20: Cannot convert 'bytes' object to unicode implicitly, decoding required
-58:20: Cannot convert 'bytes' object to unicode implicitly, decoding required
+47:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
+48:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
+
+50:17: Cannot convert 'bytes' object to str implicitly, decoding required
+51:17: Cannot convert 'bytes' object to str implicitly, decoding required
+
+57:20: Cannot convert 'bytes' object to str implicitly, decoding required
+58:20: Cannot convert 'bytes' object to str implicitly, decoding required
 59:20: Cannot convert 'char*' to unicode implicitly, decoding required
-61:24: Cannot convert 'bytes' object to basestring implicitly. This is not portable to Py3.
-62:24: Cannot convert 'bytes' object to basestring implicitly. This is not portable to Py3.
+61:24: Cannot convert 'bytes' object to str implicitly, decoding required
+62:24: Cannot convert 'bytes' object to str implicitly, decoding required
 64:19: Cannot assign type 'str object' to 'tuple object'
-65:18: Cannot assign type 'unicode object' to 'tuple object'
+65:18: Cannot assign type 'str object' to 'tuple object'
 66:18: Cannot assign type 'bytes object' to 'tuple object'
-72:11: default encoding required for conversion from 'char *' to 'str object'
+72:11: Cannot convert 'char*' to unicode implicitly, decoding required
 73:13: default encoding required for conversion from 'char *' to 'str object'
 74:15: Cannot convert 'char*' to unicode implicitly, decoding required
-75:17: default encoding required for conversion from 'char *' to 'unicode object'
+75:17: default encoding required for conversion from 'char *' to 'str object'
 """
diff --git a/tests/run/builtin_basestring.pyx b/tests/run/builtin_basestring.pyx
index 65eb35a35e0..5ad7f99b0a6 100644
--- a/tests/run/builtin_basestring.pyx
+++ b/tests/run/builtin_basestring.pyx
@@ -23,7 +23,8 @@ def basestring_is_unicode_in_py3():
     >>> basestring_is_unicode_in_py3()
     True
     """
-    return basestring is unicode
+    object_type = basestring
+    return object_type is unicode


 def unicode_subtypes_basestring():
@@ -75,7 +76,7 @@ def basestring_typed_argument(basestring obj):
     return obj


-@cython.test_assert_path_exists(
+@cython.test_fail_if_path_exists(
     "//SimpleCallNode",
     "//SimpleCallNode//NoneCheckNode",
     "//SimpleCallNode//AttributeNode[@is_py_attr = false]")
diff --git a/tests/run/cstringmul.pyx b/tests/run/cstringmul.pyx
index 1932e8d615d..368cf475f14 100644
--- a/tests/run/cstringmul.pyx
+++ b/tests/run/cstringmul.pyx
@@ -1,3 +1,6 @@
+# mode: run
+# cython: language_level=2
+
 __doc__ = u"""
 >>> print(spam)
 eggseggseggseggs
@@ -34,8 +37,8 @@ ugrail_long = 700 * u"tomato"

 cimport cython

-@cython.test_assert_path_exists("//StringNode[@value = '-----']")
-@cython.test_assert_path_exists("//StringNode[@unicode_value = '-----']")
+@cython.test_assert_path_exists("//UnicodeNode[@value = '-----']")
+@cython.test_assert_path_exists("//UnicodeNode[@bytes_value = b'-----']")
 def gh3951():
     """
     Bug occurs with language_level=2 and affects StringNode.value
diff --git a/tests/run/strmethods.pyx b/tests/run/strmethods.pyx
index 77730929cf8..bd697132d0a 100644
--- a/tests/run/strmethods.pyx
+++ b/tests/run/strmethods.pyx
@@ -5,9 +5,7 @@ cimport cython

 # Also used by the language_level=2 tests in "strmethods_ll2.pyx"

-assert cython.typeof(1 / 2) in ('long', 'double')
-IS_LANGUAGE_LEVEL_3 = cython.typeof(1 / 2) == 'double'
-str_type = "unicode object" if IS_LANGUAGE_LEVEL_3 else "str object"
+str_type = "str object"


 @cython.test_assert_path_exists(
diff --git a/tests/run/type_inference.pyx b/tests/run/type_inference.pyx
index 6bb5f248560..40424eb1418 100644
--- a/tests/run/type_inference.pyx
+++ b/tests/run/type_inference.pyx
@@ -9,7 +9,7 @@ from cpython cimport bool

 assert typeof(1 / 2) in ('long', 'double')
 IS_LANGUAGE_LEVEL_3 = typeof(1 / 2) == 'double'
-str_type = "unicode object" if IS_LANGUAGE_LEVEL_3 else "str object"
+str_type = "str object"

 ##################################################
 # type inference tests in 'full' mode (infer_types=True)
@@ -34,7 +34,7 @@ def simple():
     s = "abc"
     assert typeof(s) == str_type, (typeof(s), str_type)
     u = u"xyz"
-    assert typeof(u) == "unicode object", typeof(u)
+    assert typeof(u) == "str object", typeof(u)
     L = [1,2,3]
     assert typeof(L) == "list object", typeof(L)
     t = (4,5,6,())
@@ -48,8 +48,10 @@ def builtin_types():
     """
     b = bytes()
     assert typeof(b) == "bytes object", typeof(b)
-    u = unicode()
-    assert typeof(u) == "unicode object", typeof(u)
+    s = str()
+    assert typeof(s) == "str object", typeof(s)
+    u = unicode()  # legacy name is still available
+    assert typeof(u) == "str object", typeof(u)
     L = list()
     assert typeof(L) == "list object", typeof(L)
     t = tuple()
@@ -71,11 +73,11 @@ def slicing():
     assert typeof(b2) == "bytes object", typeof(b2)

     u = u"xyz"
-    assert typeof(u) == "unicode object", typeof(u)
+    assert typeof(u) == "str object", typeof(u)
     u1 = u[1:2]
-    assert typeof(u1) == "unicode object", typeof(u1)
+    assert typeof(u1) == "str object", typeof(u1)
     u2 = u[1:2:2]
-    assert typeof(u2) == "unicode object", typeof(u2)
+    assert typeof(u2) == "str object", typeof(u2)

     s = "xyz"
     assert typeof(s) == str_type, (typeof(s), str_type)
@@ -109,14 +111,14 @@ def indexing():
     assert typeof(b1) == "Python object", typeof(b1)  # Py2: bytes, Py3: int

     u = u"xyz"
-    assert typeof(u) == "unicode object", typeof(u)
+    assert typeof(u) == "str object", typeof(u)
     u1 = u[1]
     assert typeof(u1) == "Py_UCS4", typeof(u1)

     s = "xyz"
     assert typeof(s) == str_type, (typeof(s), str_type)
     s1 = s[1]
-    assert typeof(s1) == ("Py_UCS4" if IS_LANGUAGE_LEVEL_3 else "str object"), (typeof(s1), str_type)
+    assert typeof(s1) == "Py_UCS4", (typeof(s1), str_type)

     L = [1,2,3]
     assert typeof(L) == "list object", typeof(L)
@@ -141,7 +143,7 @@ def indexing():
     t5_1 = t5[1]
     assert typeof(t5_1) == str_type, (typeof(t5_1), str_type)
     t5_2 = t5[2]
-    assert typeof(t5_2) == "unicode object", typeof(t5_2)
+    assert typeof(t5_2) == "str object", typeof(t5_2)
     t5_3 = t5[t[0]-3]
     assert typeof(t5_3) == "Python object", typeof(t5_3)

@@ -232,12 +234,12 @@ def builtin_type_operations():

     u1 = u'a' * 10
     u1 = 10 * u'a'
-    assert typeof(u1) == "unicode object", typeof(u1)
+    assert typeof(u1) == "str object", typeof(u1)
     u2 = u'a' + u'b'
-    assert typeof(u2) == "unicode object", typeof(u2)
+    assert typeof(u2) == "str object", typeof(u2)
     u3 = u'a%s' % u'b'
     u3 = u'a%s' % 10
-    assert typeof(u3) == "unicode object", typeof(u3)
+    assert typeof(u3) == "str object", typeof(u3)

     s1 = "abc %s" % "x"
     s1 = "abc %s" % 10
@@ -283,23 +285,23 @@ def builtin_type_methods():
     assert typeof(split) == 'list object', typeof(split)

     str_result1 = u.upper()
-    assert typeof(str_result1) == 'unicode object', typeof(str_result1)
+    assert typeof(str_result1) == "str object", typeof(str_result1)
     str_result2 = u.upper().lower()
-    assert typeof(str_result2) == 'unicode object', typeof(str_result2)
+    assert typeof(str_result2) == "str object", typeof(str_result2)
     str_result3 = u.upper().lower().strip()
-    assert typeof(str_result3) == 'unicode object', typeof(str_result3)
+    assert typeof(str_result3) == "str object", typeof(str_result3)
     str_result4 = u.upper().lower().strip().lstrip()
-    assert typeof(str_result4) == 'unicode object', typeof(str_result4)
+    assert typeof(str_result4) == "str object", typeof(str_result4)
     str_result5 = u.upper().lower().strip().lstrip().rstrip()
-    assert typeof(str_result5) == 'unicode object', typeof(str_result5)
+    assert typeof(str_result5) == "str object", typeof(str_result5)
     str_result6 = u.upper().lower().strip().lstrip().rstrip().center(20)
-    assert typeof(str_result6) == 'unicode object', typeof(str_result6)
+    assert typeof(str_result6) == "str object", typeof(str_result6)
     str_result7 = u.upper().lower().strip().lstrip().rstrip().center(20).format()
-    assert typeof(str_result7) == 'unicode object', typeof(str_result7)
+    assert typeof(str_result7) == "str object", typeof(str_result7)
     str_result8 = u.upper().lower().strip().lstrip().rstrip().center(20).format().expandtabs(4)
-    assert typeof(str_result8) == 'unicode object', typeof(str_result8)
+    assert typeof(str_result8) == "str object", typeof(str_result8)
     str_result9 = u.upper().lower().strip().lstrip().rstrip().center(20).format().expandtabs(4).swapcase()
-    assert typeof(str_result9) == 'unicode object', typeof(str_result9)
+    assert typeof(str_result9) == "str object", typeof(str_result9)

     predicate1 = u.isupper()
     assert typeof(predicate1) == 'bint', typeof(predicate1)
@@ -457,7 +459,7 @@ def loop_over_str():
     # str (bytes) in Py2, str (unicode) in Py3
     for c in string:
         pass
-    assert ((typeof(c) == 'Py_UCS4') if IS_LANGUAGE_LEVEL_3 else (typeof(c) == 'str object')), typeof(c)
+    assert typeof(c) == 'Py_UCS4', typeof(c)

 def loop_over_unicode():
     """
@@ -832,7 +834,7 @@ def int64_long_sum():
 cdef class InferInProperties:
     """
     >>> InferInProperties().x
-    ('double', 'unicode object', 'MyEnum', 'MyEnum')
+    ('double', 'str object', 'MyEnum', 'MyEnum')
     """
     cdef MyEnum attr
     def __cinit__(self):
diff --git a/tests/run/unicodemethods.pyx b/tests/run/unicodemethods.pyx
index dd2aebeafab..f893a7100f7 100644
--- a/tests/run/unicodemethods.pyx
+++ b/tests/run/unicodemethods.pyx
@@ -236,7 +236,7 @@ def join_sep(l):
     ab|jd|sdflk|as|sa|sadas|asdas|fsdf
     """
     result = u'|'.join(l)
-    assert cython.typeof(result) == 'unicode object', cython.typeof(result)
+    assert cython.typeof(result) == "str object", cython.typeof(result)
     return result


@@ -263,7 +263,7 @@ def join_sep_genexpr(l):
     <>
     """
     result = u'|'.join(s + u' ' for s in l)
-    assert cython.typeof(result) == 'unicode object', cython.typeof(result)
+    assert cython.typeof(result) == "str object", cython.typeof(result)
     return result


@@ -288,7 +288,7 @@ def join_sep_genexpr_dictiter(dict d):
     0:ab|1:jd|2:sdflk|3:as|4:sa|5:sadas|6:asdas|7:fsdf
     """
     result = u' '.join('%s:%s' % (k, v) for k, v in d.iteritems())
-    assert cython.typeof(result) == 'unicode object', cython.typeof(result)
+    assert cython.typeof(result) == "str object", cython.typeof(result)
     return result


@@ -512,7 +512,7 @@ def concat(unicode s, str suffix):
     TypeError: ...
     """
     assert cython.typeof(s + object()) == 'Python object', cython.typeof(s + object())
-    assert cython.typeof(s + suffix) == 'unicode object', cython.typeof(s + suffix)
+    assert cython.typeof(s + suffix) == "str object", cython.typeof(s + suffix)
     return s + suffix


@@ -525,7 +525,7 @@ def concat_literal_str(str suffix):
     TypeError: ...NoneType...
     """
     assert cython.typeof(u'abc' + object()) == 'Python object', cython.typeof(u'abc' + object())
-    assert cython.typeof(u'abc' + suffix) == 'unicode object', cython.typeof(u'abc' + suffix)
+    assert cython.typeof(u'abc' + suffix) == "str object", cython.typeof(u'abc' + suffix)
     return u'abc' + suffix


@@ -537,7 +537,7 @@ def concat_literal_unicode(unicode suffix):
     Traceback (most recent call last):
     TypeError: ...NoneType...
     """
-    assert cython.typeof(u'abc' + suffix) == 'unicode object', cython.typeof(u'abc' + suffix)
+    assert cython.typeof(u'abc' + suffix) == "str object", cython.typeof(u'abc' + suffix)
     return u'abc' + suffix


@@ -573,7 +573,7 @@ def mod_format_literal(values):
     >>> mod_format_literal(['sa']) == "abc['sa']def" or mod_format(format1, ['sa'])
     True
     """
-    assert cython.typeof(u'abc%sdef' % values) == 'unicode object', cython.typeof(u'abc%sdef' % values)
+    assert cython.typeof(u'abc%sdef' % values) == "str object", cython.typeof(u'abc%sdef' % values)
     return u'abc%sdef' % values


@@ -585,7 +585,7 @@ def mod_format_tuple(*values):
     Traceback (most recent call last):
     TypeError: not enough arguments for format string
     """
-    assert cython.typeof(u'abc%sdef' % values) == 'unicode object', cython.typeof(u'abc%sdef' % values)
+    assert cython.typeof(u'abc%sdef' % values) == "str object", cython.typeof(u'abc%sdef' % values)
     return u'abc%sdef' % values