Skip to content

Commit

Permalink
Support files with mixed newlines (#1007)
Browse files Browse the repository at this point in the history
* Add test case with mixed newlines

* Split lines by any newline character and not just by default

* Add unit test, remove copied
  • Loading branch information
MichaReiser authored Sep 2, 2023
1 parent 9286446 commit 9c263aa
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 6 deletions.
1 change: 1 addition & 0 deletions native/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions native/libcst/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ thiserror = "1.0.37"
peg = "0.8.1"
chic = "1.2.2"
regex = "1.9.3"
memchr = "2.5.0"
libcst_derive = { path = "../libcst_derive" }

[dev-dependencies]
Expand Down
64 changes: 58 additions & 6 deletions native/libcst/src/tokenizer/whitespace_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::nodes::{
Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace,
SimpleWhitespace, TrailingWhitespace,
};
use memchr::memchr2_iter;
use regex::Regex;
use thiserror::Error;

Expand All @@ -16,9 +17,8 @@ use super::TokType;

// Per-thread compiled regexes used by the whitespace parser. Each pattern is
// compiled via `Regex::new(..).expect("regex")`, so an invalid pattern panics.
//
// NOTE(review): `NEWLINE_RE` and `COMMENT_RE` each appear twice below and
// `NEWLINE_RE_2` is listed between them. This looks like a diff rendering that
// shows removed and added lines together rather than the real file contents —
// confirm against the actual source before relying on this block.
thread_local! {
// Anchored at the start of the haystack (`\A`): zero or more of a space, form
// feed, tab, or a backslash immediately followed by a newline (`\r\n`, `\r`
// or `\n`).
static SIMPLE_WHITESPACE_RE: Regex = Regex::new(r"\A([ \f\t]|\\(\r\n?|\n))*").expect("regex");
// Anchored at the start of the haystack: exactly one newline sequence
// (`\r\n`, `\r` or `\n`).
static NEWLINE_RE: Regex = Regex::new(r"\A(\r\n?|\n)").expect("regex");
// Anchored at the start of the haystack: a `#` followed by everything up to,
// but not including, the next `\r` or `\n`.
static COMMENT_RE: Regex = Regex::new(r"\A#[^\r\n]*").expect("regex");
// Unanchored variant of the newline pattern: matches a newline sequence
// (`\r\n`, `\r` or `\n`) anywhere in the haystack.
static NEWLINE_RE_2: Regex = Regex::new(r"\r\n?|\n").expect("regex");
// Same pattern as the `NEWLINE_RE` entry above (see the review note at the top).
static NEWLINE_RE: Regex = Regex::new(r"\A(\r\n?|\n)").expect("regex");
// Same pattern as the `COMMENT_RE` entry above (see the review note at the top).
static COMMENT_RE: Regex = Regex::new(r"\A#[^\r\n]*").expect("regex");
}

#[allow(clippy::upper_case_acronyms, clippy::enum_variant_names)]
Expand Down Expand Up @@ -74,12 +74,44 @@ impl<'a> Config<'a> {
break;
}
}
let default_newline =
NEWLINE_RE_2.with(|r| r.find(input).map(|m| m.as_str()).unwrap_or("\n"));

let mut lines = Vec::new();
let mut start = 0;
let mut newline_positions = memchr2_iter(b'\n', b'\r', input.as_bytes());

while let Some(newline_position) = newline_positions.next() {
let newline_character = input.as_bytes()[newline_position] as char;

let len = if newline_character == '\r'
&& input.as_bytes().get(newline_position + 1) == Some(&b'\n')
{
// Skip the next '\n'
newline_positions.next();
2
} else {
1
};

let end = newline_position + len;
lines.push(&input[start..end]);
start = end;
}

// Push the last line if it isn't terminated by a newline character
if start < input.len() {
lines.push(&input[start..]);
}

let default_newline = match lines.first().map(|line| line.as_bytes()).unwrap_or(&[]) {
[.., b'\r', b'\n'] => "\r\n",
[.., b'\n'] => "\n",
[.., b'\r'] => "\r",
_ => "\n",
};

Self {
input,
lines: input.split_inclusive(default_newline).collect(),
lines,
default_newline,
default_indent,
}
Expand Down Expand Up @@ -401,3 +433,23 @@ pub fn parse_parenthesized_whitespace<'a>(
Ok(None)
}
}

#[cfg(test)]
mod tests {
    use crate::{tokenize, Config, Result};

    /// `Config::new` must split its input after every newline sequence, even
    /// when `\n`, `\r` and `\r\n` are mixed within the same source text. Each
    /// line keeps its own terminator, and the unterminated tail is kept as the
    /// final line.
    #[test]
    fn config_mixed_newlines() -> Result<'static, ()> {
        // One line ending per flavour, followed by an unterminated last line.
        let mixed = "'' % {\n'test1': '',\r 'test2': '',\r\n}";
        let expected_lines = ["'' % {\n", "'test1': '',\r", " 'test2': '',\r\n", "}"];

        let toks = tokenize(mixed)?;
        let parsed = Config::new(mixed, &toks);

        assert_eq!(&parsed.lines, &expected_lines);

        Ok(())
    }
}
1 change: 1 addition & 0 deletions native/libcst/tests/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fixtures/mixed_newlines.py autocrlf=false
3 changes: 3 additions & 0 deletions native/libcst/tests/fixtures/mixed_newlines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"" % {
'test1': '', 'test2': '',
}
Expand Down

0 comments on commit 9c263aa

Please sign in to comment.