Support files with mixed newlines (#1007)

* Add test case with mixed newlines
* Split lines by any newline character and not just by the default newline
* Add unit test, remove copied
Instagram · Sep 2, 2023 · 9c263aa · 9c263aa
1 parent 9286446
commit 9c263aa
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 6 deletions.
diff --git a/native/Cargo.lock b/native/Cargo.lock
diff --git a/native/libcst/Cargo.toml b/native/libcst/Cargo.toml
@@ -35,6 +35,7 @@ thiserror = "1.0.37"
 peg = "0.8.1"
 chic = "1.2.2"
 regex = "1.9.3"
+memchr = "2.5.0"
 libcst_derive = { path = "../libcst_derive" }
 
 [dev-dependencies]

diff --git a/native/libcst/src/tokenizer/whitespace_parser.rs b/native/libcst/src/tokenizer/whitespace_parser.rs
@@ -7,6 +7,7 @@ use crate::nodes::{
     Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace,
     SimpleWhitespace, TrailingWhitespace,
 };
+use memchr::memchr2_iter;
 use regex::Regex;
 use thiserror::Error;
 
@@ -16,9 +17,8 @@ use super::TokType;
 
 thread_local! {
     static SIMPLE_WHITESPACE_RE: Regex = Regex::new(r"\A([ \f\t]|\\(\r\n?|\n))*").expect("regex");
-static NEWLINE_RE: Regex = Regex::new(r"\A(\r\n?|\n)").expect("regex");
-static COMMENT_RE: Regex = Regex::new(r"\A#[^\r\n]*").expect("regex");
-static NEWLINE_RE_2: Regex = Regex::new(r"\r\n?|\n").expect("regex");
+    static NEWLINE_RE: Regex = Regex::new(r"\A(\r\n?|\n)").expect("regex");
+    static COMMENT_RE: Regex = Regex::new(r"\A#[^\r\n]*").expect("regex");
 }
 
 #[allow(clippy::upper_case_acronyms, clippy::enum_variant_names)]
@@ -74,12 +74,44 @@ impl<'a> Config<'a> {
                 break;
             }
         }
-        let default_newline =
-            NEWLINE_RE_2.with(|r| r.find(input).map(|m| m.as_str()).unwrap_or("\n"));
+
+        let mut lines = Vec::new();
+        let mut start = 0;
+        let mut newline_positions = memchr2_iter(b'\n', b'\r', input.as_bytes());
+
+        while let Some(newline_position) = newline_positions.next() {
+            let newline_character = input.as_bytes()[newline_position] as char;
+
+            let len = if newline_character == '\r'
+                && input.as_bytes().get(newline_position + 1) == Some(&b'\n')
+            {
+                // Skip the next '\n'
+                newline_positions.next();
+                2
+            } else {
+                1
+            };
+
+            let end = newline_position + len;
+            lines.push(&input[start..end]);
+            start = end;
+        }
+
+        // Push the last line if it isn't terminated by a newline character
+        if start < input.len() {
+            lines.push(&input[start..]);
+        }
+
+        let default_newline = match lines.first().map(|line| line.as_bytes()).unwrap_or(&[]) {
+            [.., b'\r', b'\n'] => "\r\n",
+            [.., b'\n'] => "\n",
+            [.., b'\r'] => "\r",
+            _ => "\n",
+        };
 
         Self {
             input,
-            lines: input.split_inclusive(default_newline).collect(),
+            lines,
             default_newline,
             default_indent,
         }
@@ -401,3 +433,23 @@ pub fn parse_parenthesized_whitespace<'a>(
         Ok(None)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::{tokenize, Config, Result};
+
+    #[test]
+    fn config_mixed_newlines() -> Result<'static, ()> {
+        let source = "'' % {\n'test1': '',\r  'test2': '',\r\n}";
+        let tokens = tokenize(source)?;
+
+        let config = Config::new(source, &tokens);
+
+        assert_eq!(
+            &config.lines,
+            &["'' % {\n", "'test1': '',\r", "  'test2': '',\r\n", "}"]
+        );
+
+        Ok(())
+    }
+}
diff --git a/native/libcst/tests/.gitattributes b/native/libcst/tests/.gitattributes
@@ -0,0 +1 @@
+fixtures/mixed_newlines.py autocrlf=false
diff --git a/native/libcst/tests/fixtures/mixed_newlines.py b/native/libcst/tests/fixtures/mixed_newlines.py
@@ -0,0 +1,3 @@
+"" % {
+  'test1': '',  'test2': '',
+}