pypa · jaraco · Jan 7, 2024 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
diff --git a/distutils/tests/test_dist.py b/distutils/tests/test_dist.py
@@ -1,6 +1,9 @@
 """Tests for distutils.dist."""
 import os
 import io
+import email
+import email.policy
+import email.generator
 import sys
 import warnings
 import textwrap
@@ -510,3 +513,41 @@ def test_read_metadata(self):
         assert metadata.platforms is None
         assert metadata.obsoletes is None
         assert metadata.requires == ['foo']
+
+    def test_round_trip_through_email_generator(self):
+        """
+        PKG-INFO must survive being parsed and re-generated with
+        ``email.generator.Generator``, even when the description contains
+        control characters such as 0x0b (see pypa/setuptools#4033).
+        """
+        # Given a PKG-INFO file whose description has a 0x0b before a newline ...
+        attrs = {
+            "name": "package",
+            "version": "1.0",
+            "long_description": "hello\x0b\nworld\n",
+        }
+        dist = Distribution(attrs)
+        metadata = dist.metadata
+
+        with io.StringIO() as buffer:
+            metadata.write_pkg_file(buffer)
+            msg = buffer.getvalue()
+
+        # ... when it is parsed and re-serialized with the stdlib email package,
+        orig = email.message_from_string(msg)
+        policy = email.policy.EmailPolicy(
+            utf8=True,
+            mangle_from_=False,
+            max_line_length=0,
+        )
+        with io.StringIO() as buffer:
+            email.generator.Generator(buffer, policy=policy).flatten(orig)
+
+            buffer.seek(0)
+            regen = email.message_from_file(buffer)
+
+        # ... then the re-parsed description has the same lines as the original
+        # (sets are compared, so line-break characters, order and repeats are ignored)
+        orig_desc = set(orig["Description"].splitlines())
+        regen_desc = set(regen["Description"].splitlines())
+        assert regen_desc == orig_desc
diff --git a/distutils/tests/test_util.py b/distutils/tests/test_util.py
@@ -1,4 +1,8 @@
 """Tests for distutils.util."""
+import email
+import email.policy
+import email.generator
+import io
 import os
 import sys
 import sysconfig as stdlib_sysconfig
@@ -184,12 +188,55 @@ def test_strtobool(self):
         for n in no:
             assert not strtobool(n)
 
-    def test_rfc822_escape(self):
-        header = 'I am a\npoor\nlonesome\nheader\n'
-        res = rfc822_escape(header)
-        wanted = ('I am a%(8s)spoor%(8s)slonesome%(8s)s' 'header%(8s)s') % {
-            '8s': '\n' + 8 * ' '
-        }
+    indent = 8 * ' '  # continuation prefix expected after every line break
+
+    @pytest.mark.parametrize(
+        "given,wanted",
+        [
+            # 0x0b, 0x0c, 0x1e, etc. are also treated as line breaks by `splitlines`
+            ("hello\x0b\nworld\n", f"hello\x0b{indent}\n{indent}world\n{indent}"),
+            ("hello\x1eworld", f"hello\x1e{indent}world"),
+            ("", ""),
+            (
+                "I am a\npoor\nlonesome\nheader\n",
+                f"I am a\n{indent}poor\n{indent}lonesome\n{indent}header\n{indent}",
+            ),
+        ],
+    )
+    def test_rfc822_escape(self, given, wanted):
+        """
+        Ensure a multi-line header is escaped so that it parses correctly.
+
+        For interoperability, the escaped value should also "round-trip" through
+        `email.generator.Generator.flatten` and `email.message_from_*`
+        (see pypa/setuptools#4033).
+
+        The main issue is that internally `email.policy.EmailPolicy` uses
+        `splitlines`, which also splits on some control chars. If any resulting
+        line is not prefixed with spaces, the parser will stop reading
+        the current header and produce an incomplete value, while
+        incorrectly interpreting the rest of the headers as part of the payload.
+        """
+        res = rfc822_escape(given)
+
+        policy = email.policy.EmailPolicy(
+            utf8=True,
+            mangle_from_=False,
+            max_line_length=0,
+        )
+        with io.StringIO() as buffer:
+            raw = f"header: {res}\nother-header: 42\n\npayload\n"
+            orig = email.message_from_string(raw)
+            email.generator.Generator(buffer, policy=policy).flatten(orig)
+            buffer.seek(0)
+            regen = email.message_from_file(buffer)
+
+        for msg in (orig, regen):
+            assert msg.get_payload() == "payload\n"
+            assert msg["other-header"] == "42"
+            # Compare as sets of lines: Generator may replace control chars with `\n`
+            assert set(msg["header"].splitlines()) == set(res.splitlines())
+
         assert res == wanted
 
     def test_dont_write_bytecode(self):

diff --git a/distutils/util.py b/distutils/util.py
@@ -508,6 +508,12 @@ def rfc822_escape(header):
     """Return a version of the string escaped for inclusion in an
     RFC-822 header, by ensuring there are 8 spaces space after each newline.
     """
-    lines = header.split('\n')
-    sep = '\n' + 8 * ' '
-    return sep.join(lines)
+    indent = 8 * " "
+    lines = header.splitlines(keepends=True)  # each piece keeps its own line break
+
+    # Emulate the old `header.split('\n')` behaviour, which indented after a trailing
+    # line break too (`splitlines` yields no extra empty line for a terminal break):
+    ends_in_newline = lines and lines[-1].splitlines()[0] != lines[-1]
+    suffix = indent if ends_in_newline else ""
+
+    return indent.join(lines) + suffix