Commit f215978

Switch to using thread_local regular expressions to avoid regex mutex contention
orf committed Aug 25, 2023
1 parent f0347f0 commit f215978
Showing 8 changed files with 446 additions and 405 deletions.
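Why this change helps: a regex held in a process-wide static (here via once_cell::sync::Lazy) is shared by every thread, and the regex crate hands out match-time scratch space from an internal synchronized pool, which can become a point of contention when many threads run the same compiled pattern at once. Declaring the statics inside thread_local! instead gives each thread its own lazily compiled copy, so matching never touches shared state. A minimal sketch of the pattern this commit applies throughout (illustrative names, not taken from the diff):

    use regex::Regex;

    thread_local! {
        // Compiled once per thread, on that thread's first access.
        static WORD_RE: Regex = Regex::new(r"\w+").expect("regex");
    }

    fn count_words(text: &str) -> usize {
        // LocalKey::with borrows this thread's instance for the closure's duration.
        WORD_RE.with(|r| r.find_iter(text).count())
    }

The trade-off is memory and per-thread compile time: every worker thread pays for its own copy of each pattern.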
630 changes: 322 additions & 308 deletions native/Cargo.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions native/libcst/Cargo.toml
@@ -34,17 +34,17 @@ pyo3 = { version = ">=0.17", optional = true }
 thiserror = "1.0.37"
 peg = "0.8.1"
 chic = "1.2.2"
-itertools = "0.10.5"
-once_cell = "1.16.0"
-regex = "1.7.0"
+regex = "1.9.3"
 libcst_derive = { path = "../libcst_derive" }
 
 [dev-dependencies]
-criterion = { version = "0.4.0", features = ["html_reports"] }
+criterion = { version = "0.5.1", features = ["html_reports"] }
 difference = "2.0.0"
+rayon = "1.7.0"
+itertools = "0.11.0"
 
 [target.'cfg(target_arch = "x86_64")'.dev-dependencies]
-criterion-cycles-per-byte = "0.1"
+criterion-cycles-per-byte = "0.5.0"
 
 [[bench]]
 name = "parser_benchmark"
47 changes: 39 additions & 8 deletions native/libcst/benches/parser_benchmark.rs
@@ -8,22 +8,22 @@ use std::{
     time::Duration,
 };
 
-use criterion::{
-    black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, Criterion,
-};
+use criterion::{BatchSize, BenchmarkId, black_box, Criterion, criterion_group, criterion_main, measurement::Measurement, Throughput};
+use itertools::Itertools;
+use rayon::prelude::*;
+
 #[cfg(target_arch = "x86_64")]
 use criterion_cycles_per_byte::CyclesPerByte;
-use itertools::Itertools;
 use libcst_native::{
-    parse_module, parse_tokens_without_whitespace, tokenize, Codegen, Config, Inflate,
+    Codegen, Config, Inflate, parse_module, parse_tokens_without_whitespace, tokenize,
 };
 
 #[cfg(not(windows))]
 const NEWLINE: &str = "\n";
 #[cfg(windows)]
 const NEWLINE: &str = "\r\n";
 
-fn load_all_fixtures() -> String {
+fn load_all_fixtures_vec() -> Vec<String> {
     let mut path = PathBuf::from(file!());
     path.pop();
     path.pop();
@@ -44,7 +44,11 @@ fn load_all_fixtures() -> String {
             let path = file.unwrap().path();
            std::fs::read_to_string(&path).expect("reading_file")
         })
-        .join(NEWLINE)
+        .collect()
+}
+
+fn load_all_fixtures() -> String {
+    load_all_fixtures_vec().join(NEWLINE)
 }
 
 pub fn inflate_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
@@ -119,6 +123,33 @@ pub fn parse_into_cst_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
     group.finish();
 }
 
+pub fn parse_into_cst_multithreaded_benchmarks<T: Measurement + std::marker::Sync>(c: &mut Criterion<T>) where <T as Measurement>::Value: Send {
+    let fixtures = load_all_fixtures_vec();
+    let mut group = c.benchmark_group("parse_into_cst_parallel");
+    group.measurement_time(Duration::from_secs(15));
+    group.warm_up_time(Duration::from_secs(5));
+
+    for thread_count in 1..10 {
+        let expanded_fixtures = (0..thread_count).flat_map(|_| fixtures.clone()).collect_vec();
+        group.throughput(Throughput::Elements(expanded_fixtures.len() as u64));
+        group.bench_with_input(BenchmarkId::from_parameter(thread_count), &thread_count, |b, thread_count| {
+            let thread_pool = rayon::ThreadPoolBuilder::new()
+                .num_threads(*thread_count).build().unwrap();
+            thread_pool.install(|| {
+                b.iter_with_large_drop(
+                    || {
+                        expanded_fixtures.par_iter()
+                            .map(|contents| black_box(parse_module(&contents, None)))
+                            .collect::<Vec<_>>()
+                    },
+                );
+            });
+        });
+    }
+
+    group.finish();
+}
+
 #[cfg(target_arch = "x86_64")]
 fn get_config() -> Criterion {
     // criterion_cycles_per_byte is only supported on x86
@@ -133,6 +164,6 @@ fn get_config() -> Criterion {
 criterion_group!(
     name=benches;
     config=get_config();
-    targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks
+    targets=parse_into_cst_multithreaded_benchmarks
 );
 criterion_main!(benches);
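A note on the benchmark design above: the fixture set is cloned once per thread, so the element count grows with the thread count and per-element throughput should stay flat under ideal scaling; lock contention shows up as throughput decaying as threads are added. Assuming the [[bench]] target declared in Cargo.toml above, the parallel benchmark can be run from native/libcst with cargo bench --bench parser_benchmark.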
50 changes: 23 additions & 27 deletions native/libcst/src/parser/numbers.rs
@@ -3,7 +3,6 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 
 use crate::nodes::deflated::{Expression, Float, Imaginary, Integer};
@@ -13,51 +12,48 @@ static BIN: &str = r"0[bB](?:_?[01])+";
 static OCT: &str = r"0[oO](?:_?[0-7])+";
 static DECIMAL: &str = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)";
 
-static INTEGER_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex")
-});
-
 static EXPONENT: &str = r"[eE][-+]?[0-9](?:_?[0-9])*";
 // Note: these don't exactly match the python implementation (exponent is not included)
 static POINT_FLOAT: &str = r"([0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?|\.[0-9](?:_?[0-9])*)";
 static EXP_FLOAT: &str = r"[0-9](?:_?[0-9])*";
 
-static FLOAT_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        format!(
-            "^({}({})?|{}{})$",
-            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
-        )
-        .as_str(),
-    )
-    .expect("regex")
-});
-
-static IMAGINARY_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        format!(
-            r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
-            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
-        )
-        .as_str(),
-    )
-    .expect("regex")
-});
+thread_local! {
+    static INTEGER_RE: Regex =
+        Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex");
+    static FLOAT_RE: Regex =
+        Regex::new(
+            format!(
+                "^({}({})?|{}{})$",
+                POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+            )
+            .as_str(),
+        )
+        .expect("regex");
+    static IMAGINARY_RE: Regex =
+        Regex::new(
+            format!(
+                r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
+                POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+            )
+            .as_str(),
        )
+        .expect("regex");
+}
 
 pub(crate) fn parse_number(raw: &str) -> Expression {
-    if INTEGER_RE.is_match(raw) {
+    if INTEGER_RE.with(|r| r.is_match(raw)) {
         Expression::Integer(Box::new(Integer {
             value: raw,
             lpar: Default::default(),
             rpar: Default::default(),
         }))
-    } else if FLOAT_RE.is_match(raw) {
+    } else if FLOAT_RE.with(|r| r.is_match(raw)) {
         Expression::Float(Box::new(Float {
             value: raw,
             lpar: Default::default(),
             rpar: Default::default(),
         }))
-    } else if IMAGINARY_RE.is_match(raw) {
+    } else if IMAGINARY_RE.with(|r| r.is_match(raw)) {
         Expression::Imaginary(Box::new(Imaginary {
             value: raw,
             lpar: Default::default(),
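Call sites change from implicit deref (INTEGER_RE.is_match(raw)) to LocalKey::with, which hands the closure a borrow of the calling thread's instance. One behavioral consequence, sketched below with a hypothetical pattern (not from this diff): the initializer now runs once per thread rather than once per process.

    use regex::Regex;
    use std::thread;

    thread_local! {
        static DIGITS_RE: Regex = {
            // Runs on first access from *each* thread, not once globally.
            println!("compiling on {:?}", thread::current().id());
            Regex::new(r"^[0-9]+$").expect("regex")
        };
    }

    fn main() {
        let handles: Vec<_> = (0..2)
            .map(|_| thread::spawn(|| DIGITS_RE.with(|r| r.is_match("123"))))
            .collect();
        for handle in handles {
            assert!(handle.join().unwrap()); // "compiling" prints twice, once per thread
        }
    }

For short-lived threads that recompilation is wasted work, but rayon's pooled workers (as used by the benchmark above) compile each pattern at most once per worker.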
63 changes: 32 additions & 31 deletions native/libcst/src/tokenizer/core/mod.rs
@@ -58,7 +58,6 @@
 /// [RustPython's parser]: https://crates.io/crates/rustpython-parser
 mod string_types;
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 use std::cell::RefCell;
 use std::cmp::Ordering;
@@ -83,25 +82,27 @@ const MAX_INDENT: usize = 100;
 // https://github.com/rust-lang/rust/issues/71763
 const MAX_CHAR: char = '\u{10ffff}';
 
-static SPACE_TAB_FORMFEED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[ \f\t]+").expect("regex"));
-static ANY_NON_NEWLINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[^\r\n]+").expect("regex"));
-static STRING_PREFIX_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex"));
-static POTENTIAL_IDENTIFIER_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex"));
-static DECIMAL_DOT_DIGIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A\.[0-9]").expect("regex"));
-static DECIMAL_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A[0-9](_?[0-9])*").expect("regex"));
-static HEXADECIMAL_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex"));
-static OCTAL_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[0-7])+").expect("regex"));
-static BINARY_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[01])+").expect("regex"));
-
-/// Used to verify identifiers when there's a non-ascii character in them.
-// This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
-// given Python version's behavior.
-static UNICODE_IDENTIFIER_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex"));
+thread_local! {
+    static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex");
+    static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex");
+    static STRING_PREFIX_RE: Regex =
+        Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex");
+    static POTENTIAL_IDENTIFIER_TAIL_RE: Regex =
+        Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex");
+    static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex");
+    static DECIMAL_TAIL_RE: Regex =
+        Regex::new(r"\A[0-9](_?[0-9])*").expect("regex");
+    static HEXADECIMAL_TAIL_RE: Regex =
+        Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex");
+    static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex");
+    static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex");
+
+    /// Used to verify identifiers when there's a non-ascii character in them.
+    // This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
+    // given Python version's behavior.
+    static UNICODE_IDENTIFIER_RE: Regex =
+        Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex");
+}
 
 #[derive(Debug, Eq, PartialEq, Copy, Clone)]
 pub enum TokType {
@@ -316,11 +317,11 @@ impl<'t> TokState<'t> {
 
         'again: loop {
            // Skip spaces
-            self.text_pos.consume(&*SPACE_TAB_FORMFEED_RE);
+            SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v));
 
             // Skip comment, unless it's a type comment
             if self.text_pos.peek() == Some('#') {
-                self.text_pos.consume(&*ANY_NON_NEWLINE_RE);
+                ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v));
                 // type_comment is not supported
             }
 
@@ -384,7 +385,7 @@
            }
 
             // Number starting with period
-            Some('.') if self.text_pos.matches(&*DECIMAL_DOT_DIGIT_RE) => {
+            Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => {
                 self.consume_number(NumberState::Fraction)
             }
 
@@ -472,7 +473,7 @@
             }
 
             // Operator
-            Some(_) if self.text_pos.consume(&*OPERATOR_RE) => Ok(TokType::Op),
+            Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op),
 
             // Bad character
             // If nothing works, fall back to this error. CPython returns an OP in this case,
@@ -623,7 +624,7 @@
 
     fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
         // Process the various legal combinations of b"", r"", u"", and f"".
-        if self.text_pos.consume(&*STRING_PREFIX_RE) {
+        if STRING_PREFIX_RE.with(|r| self.text_pos.consume(r)) {
             if let Some('"') | Some('\'') = self.text_pos.peek() {
                 // We found a string, not an identifier. Bail!
                 if self.split_fstring
@@ -645,7 +646,7 @@
                 Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
             ));
         }
-        self.text_pos.consume(&*POTENTIAL_IDENTIFIER_TAIL_RE);
+        POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r));
         let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
         if !verify_identifier(identifier_str) {
             // TODO: async/await
@@ -691,7 +692,7 @@
         match self.text_pos.peek() {
             Some('x') | Some('X') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*HEXADECIMAL_TAIL_RE)
+                if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     Err(TokError::BadHexadecimal)
@@ -701,7 +702,7 @@
             }
             Some('o') | Some('O') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*OCTAL_TAIL_RE)
+                if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     return Err(TokError::BadOctal);
@@ -715,7 +716,7 @@
             }
             Some('b') | Some('B') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*BINARY_TAIL_RE)
+                if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     return Err(TokError::BadBinary);
@@ -819,7 +820,7 @@
 
     /// Processes a decimal tail. This is the bit after the dot or after an E in a float.
     fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
-        let result = self.text_pos.consume(&*DECIMAL_TAIL_RE);
+        let result = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r));
         // Assumption: If we've been called, the first character is an integer, so we must have a
         // regex match
         debug_assert!(result, "try_decimal_tail was called on a non-digit char");
@@ -1058,7 +1059,7 @@ fn verify_identifier(name: &str) -> bool {
     // TODO: If `name` is non-ascii, must first normalize name to NFKC.
     // Common case: If the entire string is ascii, we can avoid the more expensive regex check,
     // since the tokenizer already validates ascii characters before calling us.
-    name.is_ascii() || UNICODE_IDENTIFIER_RE.is_match(name)
+    name.is_ascii() || UNICODE_IDENTIFIER_RE.with(|r| r.is_match(name))
 }
 
 #[derive(Clone)]
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/operators.rs
@@ -8,7 +8,6 @@
 // code or that we retain the original work's copyright information.
 // https://docs.python.org/3/license.html#zero-clause-bsd-license-for-code-in-the-python-release-documentation
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 
 /// A list of strings that make up all the possible operators in a specific version of Python.
@@ -69,7 +68,8 @@ pub const OPERATORS: &[&str] = &[
     "<>",
 ];
 
-pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
+thread_local! {
+    pub static OPERATOR_RE: Regex = {
     // sort operators so that we try to match the longest ones first
     let mut sorted_operators: Box<[&str]> = OPERATORS.into();
     sorted_operators.sort_unstable_by_key(|op| usize::MAX - op.len());
@@ -82,4 +82,5 @@ pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
             .join("|")
     ))
     .expect("regex")
-});
+    };
+}
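Note that thread_local! accepts an arbitrary initializer block and pub visibility, so the sorting logic moves out of the old Lazy closure nearly unchanged; other modules now reach OPERATOR_RE through its LocalKey handle (see the OPERATOR_RE.with call in core/mod.rs above) instead of dereferencing a shared static.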
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/text_position/mod.rs
@@ -5,14 +5,15 @@
 
 mod char_width;
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 use std::fmt;
 
 use crate::tokenizer::debug_utils::EllipsisDebug;
 use char_width::NewlineNormalizedCharWidths;
 
-static CR_OR_LF_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\r\n]").expect("regex"));
+thread_local! {
+    static CR_OR_LF_RE: Regex = Regex::new(r"[\r\n]").expect("regex");
+}
 
 pub trait TextPattern {
     fn match_len(&self, text: &str) -> Option<usize>;
@@ -98,7 +99,7 @@ impl<'t> TextPosition<'t> {
         match match_len {
             Some(match_len) => {
                 assert!(
-                    !CR_OR_LF_RE.is_match(&rest_of_text[..match_len]),
+                    !CR_OR_LF_RE.with(|r| r.is_match(&rest_of_text[..match_len])),
                     "matches pattern must not match a newline",
                 );
                 true