Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH some steps to make cloudpickle dynamic function/classes more deterministic #524

Merged
merged 20 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions ci/install_coverage_subprocess_pth.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
import coverage; coverage.process_startup()
"""

filename = op.join(get_path('purelib'), 'coverage_subprocess.pth')
with open(filename, 'wb') as f:
f.write(FILE_CONTENT.encode('ascii'))
filename = op.join(get_path("purelib"), "coverage_subprocess.pth")
with open(filename, "wb") as f:
f.write(FILE_CONTENT.encode("ascii"))

print('Installed subprocess coverage support: %s' % filename)
print("Installed subprocess coverage support: %s" % filename)
75 changes: 52 additions & 23 deletions cloudpickle/cloudpickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _lookup_class_or_track(class_tracker_id, class_def):


def register_pickle_by_value(module):
"""Register a module to make it functions and classes picklable by value.
"""Register a module to make its functions and classes picklable by value.

By default, functions and classes that are attributes of an importable
module are to be pickled by reference, that is relying on re-importing
Expand Down Expand Up @@ -369,7 +369,7 @@ def func():
# sys.modules.
if name is not None and name.startswith(prefix):
# check whether the function can address the sub-module
tokens = set(name[len(prefix) :].split("."))
tokens = set(name[len(prefix):].split("."))
if not tokens - set(code.co_names):
subimports.append(sys.modules[name])
return subimports
Expand Down Expand Up @@ -409,7 +409,9 @@ def _walk_global_ops(code):

def _extract_class_dict(cls):
"""Retrieve a copy of the dict of a class without the inherited method."""
clsdict = dict(cls.__dict__) # copy dict proxy to a dict
# copy dict proxy to a dict. Sort the keys to make the pickle deterministic
clsdict = {k: cls.__dict__[k] for k in sorted(cls.__dict__)}

if len(cls.__bases__) == 1:
inherited_dict = cls.__bases__[0].__dict__
else:
Expand Down Expand Up @@ -533,9 +535,15 @@ class id will also reuse this class definition.
The "extra" variable is meant to be a dict (or None) that can be used for
forward compatibility shall the need arise.
"""
# We need to intern the keys of the type_kwargs dict to avoid having
# different pickles for the same dynamic class depending on whether it was
# dynamically created or reconstructed from a pickled stream.
type_kwargs = {sys.intern(k): v for k, v in type_kwargs.items()}
ogrisel marked this conversation as resolved.
Show resolved Hide resolved

skeleton_class = types.new_class(
name, bases, {"metaclass": type_constructor}, lambda ns: ns.update(type_kwargs)
)

return _lookup_class_or_track(class_tracker_id, skeleton_class)


Expand Down Expand Up @@ -694,7 +702,9 @@ def _function_getstate(func):
# unpickling time by iterating over slotstate and calling setattr(func,
# slotname, slotvalue)
slotstate = {
"__name__": func.__name__,
# Intern the function names to be consistent with the method names that are
# automatically interned with `setattr`. This is only the case for cpython.
"__name__": sys.intern(func.__name__) if not PYPY else func.__name__,
"__qualname__": func.__qualname__,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any idea why we don't need this treatment for the __qualname__ value nor for the keys of func.__dict__?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, at least for the latter, there are still things to solve. I will push a slight change to the test to show what I mean.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue with string interning is mostly due to collisions between the same string defined in two different places: typically method names that are interned as class attributes, vs. the dynamic function name.
So I would say this won't happen for __qualname__, as it is only present here — but maybe I am wrong.

"__annotations__": func.__annotations__,
"__kwdefaults__": func.__kwdefaults__,
Expand Down Expand Up @@ -802,6 +812,19 @@ def _code_reduce(obj):
# of the specific type from types, for example:
# >>> from types import CodeType
# >>> help(CodeType)

# We need to intern the function names to be consistent with the method
# names, which are interned automatically with `setattr`. This is only the case for CPython.
co_name = sys.intern(obj.co_name) if not PYPY else obj.co_name

# Create copies of these tuples to make the cloudpickle payload deterministic.
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
# When creating a code object during load, copies of these four tuples are
# created, while in the main process, these tuples can be shared.
# By always creating copies, we make sure the resulting payload is deterministic.
co_names = tuple(name for name in obj.co_names)
co_varnames = tuple(name for name in obj.co_varnames)
co_freevars = tuple(name for name in obj.co_freevars)
co_cellvars = tuple(name for name in obj.co_cellvars)
if hasattr(obj, "co_exceptiontable"):
# Python 3.11 and later: there are some new attributes
# related to the enhanced exceptions.
Expand All @@ -814,16 +837,16 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_names,
obj.co_varnames,
co_names,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_qualname,
obj.co_firstlineno,
obj.co_linetable,
obj.co_exceptiontable,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
)
elif hasattr(obj, "co_linetable"):
# Python 3.10 and later: obj.co_lnotab is deprecated and constructor
Expand All @@ -837,14 +860,14 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_names,
obj.co_varnames,
co_names,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_firstlineno,
obj.co_linetable,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
)
elif hasattr(obj, "co_nmeta"): # pragma: no cover
# "nogil" Python: modified attributes from 3.9
Expand All @@ -859,15 +882,15 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_varnames,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_firstlineno,
obj.co_lnotab,
obj.co_exc_handlers,
obj.co_jump_table,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
obj.co_free2reg,
obj.co_cell2reg,
)
Expand All @@ -882,14 +905,14 @@ def _code_reduce(obj):
obj.co_flags,
obj.co_code,
obj.co_consts,
obj.co_names,
obj.co_varnames,
co_names,
co_varnames,
obj.co_filename,
obj.co_name,
co_name,
obj.co_firstlineno,
obj.co_lnotab,
obj.co_freevars,
obj.co_cellvars,
co_freevars,
co_cellvars,
)
return types.CodeType, args

Expand Down Expand Up @@ -1127,6 +1150,12 @@ def _class_setstate(obj, state):
if attrname == "_abc_impl":
registry = attr
else:
# Note: attribute names are automatically interned in CPython. This means that to get
# deterministic pickling in a subprocess, we need to make sure that the dynamic function names
# are also interned, since the Pickler's memoizer relies on physical object
# identity to break cycles in the reference graph of the object being
# serialized.
# https://github.com/python/cpython/blob/main/Objects/object.c#L1060
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
setattr(obj, attrname, attr)
if registry is not None:
for subclass in registry:
Expand Down
1 change: 1 addition & 0 deletions cloudpickle/cloudpickle_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

See: tests/test_backward_compat.py
"""

from . import cloudpickle


Expand Down
3 changes: 3 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import pytest

# Opt the shared test helpers into pytest's assertion rewriting so that bare
# `assert` statements inside tests.testutils (e.g. check_determinist_pickle)
# produce rich failure introspection instead of a plain AssertionError.
pytest.register_assert_rewrite("tests.testutils")
134 changes: 131 additions & 3 deletions tests/cloudpickle_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@
from cloudpickle.cloudpickle import _extract_class_dict, _whichmodule
from cloudpickle.cloudpickle import _lookup_module_and_qualname

from .testutils import subprocess_worker
from .testutils import subprocess_pickle_echo
from .testutils import subprocess_pickle_string
from .testutils import assert_run_python_script
from .testutils import subprocess_worker
from .testutils import check_determinist_pickle
tomMoral marked this conversation as resolved.
Show resolved Hide resolved


_TEST_GLOBAL_VARIABLE = "default_value"
Expand Down Expand Up @@ -108,7 +109,7 @@ def method_c(self):
return "c"

clsdict = _extract_class_dict(C)
assert sorted(clsdict.keys()) == ["C_CONSTANT", "__doc__", "method_c"]
assert list(clsdict.keys()) == ["C_CONSTANT", "__doc__", "method_c"]
assert clsdict["C_CONSTANT"] == 43
assert clsdict["__doc__"] is None
assert clsdict["method_c"](C()) == C().method_c()
Expand Down Expand Up @@ -1951,7 +1952,6 @@ def lookup(obj_id):

class A:
'''Updated class definition'''
pass

assert not w.run(lambda obj_id: isinstance(lookup(obj_id), A), id1)
retrieved1 = w.run(lookup, id1)
Expand Down Expand Up @@ -1983,6 +1983,134 @@ class A:
""".format(protocol=self.protocol)
assert_run_python_script(code)

def test_dynamic_func_determinist(self):
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
# Check that the pickle serialization for a dynamic func is the same
# in two processes.

def get_dynamic_func_pickle():
    # Define a throwaway dynamic function locally so that it cannot be
    # pickled by reference, then return its cloudpickle payload bytes.
    def test_method(arg_1, arg_2):
        pass
    return cloudpickle.dumps(test_method)

with subprocess_worker(protocol=self.protocol) as w:

A_dump = w.run(get_dynamic_func_pickle)
check_determinist_pickle(A_dump, get_dynamic_func_pickle())
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

def test_dynamic_class_determinist(self):
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
# Check that the pickle serialization for a dynamic class is the same
# in two processes.
pytest.xfail("This test fails due to different tracker_id.")

def get_dynamic_class_pickle():
    # Build a dynamic class whose attribute names deliberately collide with
    # common strings ("arg_1", "join") to exercise string-interning edge
    # cases, then return its cloudpickle payload bytes.
    class A:
        """Class with potential string interning issues."""

        arg_1 = "class_value"

        def join(self):
            pass

        def test_method(self, arg_1, join):
            pass
    return cloudpickle.dumps(A)

with subprocess_worker(protocol=self.protocol) as w:

A_dump = w.run(get_dynamic_class_pickle)
check_determinist_pickle(A_dump, get_dynamic_class_pickle())
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

def test_dynamic_class_determinist_subworker_order(self):
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
# Check that the pickle produced by the unpickled instance is the same.
# This checks that the order of the class attributes is deterministic.
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

with subprocess_worker(protocol=self.protocol) as w:

class A:
"""Simple class definition"""

pass

A_dump = w.run(cloudpickle.dumps, A)
check_determinist_pickle(A_dump, cloudpickle.dumps(A))
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

# If the doc is defined after some class variables, this can cause ordering changes
# due to the way we reconstruct the class with _make_class_skeleton, which creates
# the class (and thus `__doc__`) before populating it.
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
class A:
name = "A"
__doc__ = "Updated class definition"

A_dump = w.run(cloudpickle.dumps, A)
check_determinist_pickle(A_dump, cloudpickle.dumps(A))
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

# If the doc is defined in `__init__`, this can cause ordering changes due to the way
# we reconstruct the class with _make_class_skeleton. Make sure the order is deterministic.
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
class A:
def __init__(self):
"""Class definition with explicit __init__"""
pass

A_dump = w.run(cloudpickle.dumps, A)
check_determinist_pickle(A_dump, cloudpickle.dumps(A))
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

def test_dynamic_class_determinist_subworker_str_interning(self):
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
# Check that the pickle produced by the unpickled instance is the same.
# This checks that there is no issue due to string interning.
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

with subprocess_worker(protocol=self.protocol) as w:
# Due to interning of class attributes, check that this does not create issues
# with dynamic function definition.
class A:
"""Class with potential string interning issues."""

arg_1 = "class_value"

def join(self):
pass

def test_method(self, arg_1, join):
pass

A_dump = w.run(cloudpickle.dumps, A)
check_determinist_pickle(A_dump, cloudpickle.dumps(A))

# XXX - this does not seem to work, and I am not sure there is an easy fix.
class A:
"""Class with potential string interning issues."""

arg_1 = "join"

def join(self, arg_1):
pass

A_dump = w.run(cloudpickle.dumps, A)
pytest.xfail(
"This test is expected to fail due to string interning errors."
)
tomMoral marked this conversation as resolved.
Show resolved Hide resolved
check_determinist_pickle(A_dump, cloudpickle.dumps(A))

def test_dynamic_class_determinist_subworker_tuple_memoization(self):
    # Check that the pickle produced by the unpickled instance is the same.
    # This highlights some issues with tuple memoization: pickling the class
    # in the parent and in a subprocess should yield identical payloads.

    with subprocess_worker(protocol=self.protocol) as w:
        # The methods' identical argument tuples are memoized (shared by
        # object identity) in the main process but not in the subprocess,
        # as the tuples do not share the same id in the loaded class.

        # XXX - this does not seem to work, and I am not sure there is an
        # easy fix.
        class A:
            """Class with potential tuple memoization issues."""

            def func1(self,):
                pass

            def func2(self,):
                pass

        # Payload produced by the subprocess vs. the one produced locally.
        A_dump = w.run(cloudpickle.dumps, A)
        check_determinist_pickle(A_dump, cloudpickle.dumps(A))
tomMoral marked this conversation as resolved.
Show resolved Hide resolved

@pytest.mark.skipif(
platform.python_implementation() == "PyPy",
reason="Skip PyPy because memory grows too much",
Expand Down
5 changes: 4 additions & 1 deletion tests/cloudpickle_testpkg/_cloudpickle_testpkg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,22 @@ def relative_imports_factory():
Relative import of functions living both inside modules and packages are
tested.
"""

def f():
# module_function belongs to _cloudpickle_testpkg.mod, which is a
# module
from .mod import module_function

return module_function()

def g():
# package_function belongs to _cloudpickle_testpkg, which is a package
from . import package_function

return package_function()

return f, g


some_singleton = _SingletonClass()
T = typing.TypeVar('T')
T = typing.TypeVar("T")
Loading