Commit f215978

Switch to using thread_local regular expressions to avoid regex mutex contention
orf committed Aug 25, 2023
1 parent f0347f0 commit f215978
Showing 8 changed files with 446 additions and 405 deletions.
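Why this change helps: a regex held in a process-wide static (here via once_cell::sync::Lazy) is shared by every thread, and the regex crate hands out match-time scratch space from an internal synchronized pool, which can become a point of contention when many threads run the same compiled pattern at once. Declaring the statics inside thread_local! instead gives each thread its own lazily compiled copy, so matching never touches shared state. A minimal sketch of the pattern this commit applies throughout (illustrative names, not taken from the diff):

    use regex::Regex;

    thread_local! {
        // Compiled once per thread, on that thread's first access.
        static WORD_RE: Regex = Regex::new(r"\w+").expect("regex");
    }

    fn count_words(text: &str) -> usize {
        // LocalKey::with borrows this thread's instance for the closure's duration.
        WORD_RE.with(|r| r.find_iter(text).count())
    }

The trade-off is memory and per-thread compile time: every worker thread pays for its own copy of each pattern.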
630 changes: 322 additions & 308 deletions native/Cargo.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions native/libcst/Cargo.toml
@@ -34,17 +34,17 @@ pyo3 = { version = ">=0.17", optional = true }
 thiserror = "1.0.37"
 peg = "0.8.1"
 chic = "1.2.2"
-itertools = "0.10.5"
-once_cell = "1.16.0"
-regex = "1.7.0"
+regex = "1.9.3"
 libcst_derive = { path = "../libcst_derive" }
 
 [dev-dependencies]
-criterion = { version = "0.4.0", features = ["html_reports"] }
+criterion = { version = "0.5.1", features = ["html_reports"] }
 difference = "2.0.0"
+rayon = "1.7.0"
+itertools = "0.11.0"
 
 [target.'cfg(target_arch = "x86_64")'.dev-dependencies]
-criterion-cycles-per-byte = "0.1"
+criterion-cycles-per-byte = "0.5.0"
 
 [[bench]]
 name = "parser_benchmark"
47 changes: 39 additions & 8 deletions native/libcst/benches/parser_benchmark.rs
@@ -8,22 +8,22 @@ use std::{
     time::Duration,
 };
 
-use criterion::{
-    black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, Criterion,
-};
+use criterion::{BatchSize, BenchmarkId, black_box, Criterion, criterion_group, criterion_main, measurement::Measurement, Throughput};
+use itertools::Itertools;
+use rayon::prelude::*;
+
 #[cfg(target_arch = "x86_64")]
 use criterion_cycles_per_byte::CyclesPerByte;
-use itertools::Itertools;
 use libcst_native::{
-    parse_module, parse_tokens_without_whitespace, tokenize, Codegen, Config, Inflate,
+    Codegen, Config, Inflate, parse_module, parse_tokens_without_whitespace, tokenize,
 };
 
 #[cfg(not(windows))]
 const NEWLINE: &str = "\n";
 #[cfg(windows)]
 const NEWLINE: &str = "\r\n";
 
-fn load_all_fixtures() -> String {
+fn load_all_fixtures_vec() -> Vec<String> {
     let mut path = PathBuf::from(file!());
     path.pop();
     path.pop();
@@ -44,7 +44,11 @@ fn load_all_fixtures() -> String {
             let path = file.unwrap().path();
            std::fs::read_to_string(&path).expect("reading_file")
         })
-        .join(NEWLINE)
+        .collect()
+}
+
+fn load_all_fixtures() -> String {
+    load_all_fixtures_vec().join(NEWLINE)
 }
 
 pub fn inflate_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
@@ -119,6 +123,33 @@ pub fn parse_into_cst_benchmarks<T: Measurement>(c: &mut Criterion<T>) {
     group.finish();
 }
 
+pub fn parse_into_cst_multithreaded_benchmarks<T: Measurement + std::marker::Sync>(c: &mut Criterion<T>) where <T as Measurement>::Value: Send {
+    let fixtures = load_all_fixtures_vec();
+    let mut group = c.benchmark_group("parse_into_cst_parallel");
+    group.measurement_time(Duration::from_secs(15));
+    group.warm_up_time(Duration::from_secs(5));
+
+    for thread_count in 1..10 {
+        let expanded_fixtures = (0..thread_count).flat_map(|_| fixtures.clone()).collect_vec();
+        group.throughput(Throughput::Elements(expanded_fixtures.len() as u64));
+        group.bench_with_input(BenchmarkId::from_parameter(thread_count), &thread_count, |b, thread_count| {
+            let thread_pool = rayon::ThreadPoolBuilder::new()
+                .num_threads(*thread_count).build().unwrap();
+            thread_pool.install(|| {
+                b.iter_with_large_drop(
+                    || {
+                        expanded_fixtures.par_iter()
+                            .map(|contents| black_box(parse_module(&contents, None)))
+                            .collect::<Vec<_>>()
+                    },
+                );
+            });
+        });
+    }
+
+    group.finish();
+}
+
 #[cfg(target_arch = "x86_64")]
 fn get_config() -> Criterion {
     // criterion_cycles_per_byte is only supported on x86
@@ -133,6 +164,6 @@ fn get_config() -> Criterion {
 criterion_group!(
     name=benches;
     config=get_config();
-    targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks, parse_into_cst_benchmarks
+    targets=parse_into_cst_multithreaded_benchmarks
 );
 criterion_main!(benches);
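A note on the benchmark design above: the fixture set is cloned once per thread, so the element count grows with the thread count and per-element throughput should stay flat under ideal scaling; lock contention shows up as throughput decaying as threads are added. Assuming the [[bench]] target declared in Cargo.toml above, the parallel benchmark can be run from native/libcst with cargo bench --bench parser_benchmark.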
50 changes: 23 additions & 27 deletions native/libcst/src/parser/numbers.rs
@@ -3,7 +3,6 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 
 use crate::nodes::deflated::{Expression, Float, Imaginary, Integer};
@@ -13,51 +12,48 @@ static BIN: &str = r"0[bB](?:_?[01])+";
 static OCT: &str = r"0[oO](?:_?[0-7])+";
 static DECIMAL: &str = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)";
 
-static INTEGER_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex")
-});
-
 static EXPONENT: &str = r"[eE][-+]?[0-9](?:_?[0-9])*";
 // Note: these don't exactly match the python implementation (exponent is not included)
 static POINT_FLOAT: &str = r"([0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?|\.[0-9](?:_?[0-9])*)";
 static EXP_FLOAT: &str = r"[0-9](?:_?[0-9])*";
 
-static FLOAT_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        format!(
-            "^({}({})?|{}{})$",
-            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
-        )
-        .as_str(),
-    )
-    .expect("regex")
-});
-
-static IMAGINARY_RE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        format!(
-            r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
-            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
-        )
-        .as_str(),
-    )
-    .expect("regex")
-});
+thread_local! {
+    static INTEGER_RE: Regex =
+        Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex");
+    static FLOAT_RE: Regex =
+        Regex::new(
+            format!(
+                "^({}({})?|{}{})$",
+                POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+            )
+            .as_str(),
+        )
+        .expect("regex");
+    static IMAGINARY_RE: Regex =
+        Regex::new(
+            format!(
+                r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
+                POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+            )
+            .as_str(),
        )
+        .expect("regex");
+}
 
 pub(crate) fn parse_number(raw: &str) -> Expression {
-    if INTEGER_RE.is_match(raw) {
+    if INTEGER_RE.with(|r| r.is_match(raw)) {
         Expression::Integer(Box::new(Integer {
             value: raw,
             lpar: Default::default(),
             rpar: Default::default(),
         }))
-    } else if FLOAT_RE.is_match(raw) {
+    } else if FLOAT_RE.with(|r| r.is_match(raw)) {
         Expression::Float(Box::new(Float {
             value: raw,
             lpar: Default::default(),
             rpar: Default::default(),
         }))
-    } else if IMAGINARY_RE.is_match(raw) {
+    } else if IMAGINARY_RE.with(|r| r.is_match(raw)) {
         Expression::Imaginary(Box::new(Imaginary {
             value: raw,
             lpar: Default::default(),
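Call sites change from implicit deref (INTEGER_RE.is_match(raw)) to LocalKey::with, which hands the closure a borrow of the calling thread's instance. One behavioral consequence, sketched below with a hypothetical pattern (not from this diff): the initializer now runs once per thread rather than once per process.

    use regex::Regex;
    use std::thread;

    thread_local! {
        static DIGITS_RE: Regex = {
            // Runs on first access from *each* thread, not once globally.
            println!("compiling on {:?}", thread::current().id());
            Regex::new(r"^[0-9]+$").expect("regex")
        };
    }

    fn main() {
        let handles: Vec<_> = (0..2)
            .map(|_| thread::spawn(|| DIGITS_RE.with(|r| r.is_match("123"))))
            .collect();
        for handle in handles {
            assert!(handle.join().unwrap()); // "compiling" prints twice, once per thread
        }
    }

For short-lived threads that recompilation is wasted work, but rayon's pooled workers (as used by the benchmark above) compile each pattern at most once per worker.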
63 changes: 32 additions & 31 deletions native/libcst/src/tokenizer/core/mod.rs
@@ -58,7 +58,6 @@
 /// [RustPython's parser]: https://crates.io/crates/rustpython-parser
 mod string_types;
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 use std::cell::RefCell;
 use std::cmp::Ordering;
@@ -83,25 +82,27 @@ const MAX_INDENT: usize = 100;
 // https://github.com/rust-lang/rust/issues/71763
 const MAX_CHAR: char = '\u{10ffff}';
 
-static SPACE_TAB_FORMFEED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[ \f\t]+").expect("regex"));
-static ANY_NON_NEWLINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[^\r\n]+").expect("regex"));
-static STRING_PREFIX_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex"));
-static POTENTIAL_IDENTIFIER_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex"));
-static DECIMAL_DOT_DIGIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A\.[0-9]").expect("regex"));
-static DECIMAL_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A[0-9](_?[0-9])*").expect("regex"));
-static HEXADECIMAL_TAIL_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex"));
-static OCTAL_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[0-7])+").expect("regex"));
-static BINARY_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[01])+").expect("regex"));
-
-/// Used to verify identifiers when there's a non-ascii character in them.
-// This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
-// given Python version's behavior.
-static UNICODE_IDENTIFIER_RE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex"));
+thread_local! {
+    static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex");
+    static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex");
+    static STRING_PREFIX_RE: Regex =
+        Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex");
+    static POTENTIAL_IDENTIFIER_TAIL_RE: Regex =
+        Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex");
+    static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex");
+    static DECIMAL_TAIL_RE: Regex =
+        Regex::new(r"\A[0-9](_?[0-9])*").expect("regex");
+    static HEXADECIMAL_TAIL_RE: Regex =
+        Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex");
+    static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex");
+    static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex");
+
+    /// Used to verify identifiers when there's a non-ascii character in them.
+    // This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
+    // given Python version's behavior.
+    static UNICODE_IDENTIFIER_RE: Regex =
+        Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex");
+}
 
 #[derive(Debug, Eq, PartialEq, Copy, Clone)]
 pub enum TokType {
@@ -316,11 +317,11 @@ impl<'t> TokState<'t> {
 
         'again: loop {
            // Skip spaces
-            self.text_pos.consume(&*SPACE_TAB_FORMFEED_RE);
+            SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v));
 
             // Skip comment, unless it's a type comment
             if self.text_pos.peek() == Some('#') {
-                self.text_pos.consume(&*ANY_NON_NEWLINE_RE);
+                ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v));
                 // type_comment is not supported
             }
 
@@ -384,7 +385,7 @@
            }
 
             // Number starting with period
-            Some('.') if self.text_pos.matches(&*DECIMAL_DOT_DIGIT_RE) => {
+            Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => {
                 self.consume_number(NumberState::Fraction)
             }
 
@@ -472,7 +473,7 @@
             }
 
             // Operator
-            Some(_) if self.text_pos.consume(&*OPERATOR_RE) => Ok(TokType::Op),
+            Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op),
 
             // Bad character
             // If nothing works, fall back to this error. CPython returns an OP in this case,
@@ -623,7 +624,7 @@
 
     fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
         // Process the various legal combinations of b"", r"", u"", and f"".
-        if self.text_pos.consume(&*STRING_PREFIX_RE) {
+        if STRING_PREFIX_RE.with(|r| self.text_pos.consume(r)) {
             if let Some('"') | Some('\'') = self.text_pos.peek() {
                 // We found a string, not an identifier. Bail!
                 if self.split_fstring
@@ -645,7 +646,7 @@
                 Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
             ));
         }
-        self.text_pos.consume(&*POTENTIAL_IDENTIFIER_TAIL_RE);
+        POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r));
         let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
         if !verify_identifier(identifier_str) {
             // TODO: async/await
@@ -691,7 +692,7 @@
         match self.text_pos.peek() {
             Some('x') | Some('X') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*HEXADECIMAL_TAIL_RE)
+                if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     Err(TokError::BadHexadecimal)
@@ -701,7 +702,7 @@
             }
             Some('o') | Some('O') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*OCTAL_TAIL_RE)
+                if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     return Err(TokError::BadOctal);
@@ -715,7 +716,7 @@
             }
             Some('b') | Some('B') => {
                 self.text_pos.next();
-                if !self.text_pos.consume(&*BINARY_TAIL_RE)
+                if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r))
                     || self.text_pos.peek() == Some('_')
                 {
                     return Err(TokError::BadBinary);
@@ -819,7 +820,7 @@
 
     /// Processes a decimal tail. This is the bit after the dot or after an E in a float.
     fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
-        let result = self.text_pos.consume(&*DECIMAL_TAIL_RE);
+        let result = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r));
         // Assumption: If we've been called, the first character is an integer, so we must have a
         // regex match
         debug_assert!(result, "try_decimal_tail was called on a non-digit char");
@@ -1058,7 +1059,7 @@ fn verify_identifier(name: &str) -> bool {
     // TODO: If `name` is non-ascii, must first normalize name to NFKC.
     // Common case: If the entire string is ascii, we can avoid the more expensive regex check,
     // since the tokenizer already validates ascii characters before calling us.
-    name.is_ascii() || UNICODE_IDENTIFIER_RE.is_match(name)
+    name.is_ascii() || UNICODE_IDENTIFIER_RE.with(|r| r.is_match(name))
 }
 
 #[derive(Clone)]
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/operators.rs
@@ -8,7 +8,6 @@
 // code or that we retain the original work's copyright information.
 // https://docs.python.org/3/license.html#zero-clause-bsd-license-for-code-in-the-python-release-documentation
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 
 /// A list of strings that make up all the possible operators in a specific version of Python.
@@ -69,7 +68,8 @@ pub const OPERATORS: &[&str] = &[
     "<>",
 ];
 
-pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
+thread_local! {
+    pub static OPERATOR_RE: Regex = {
     // sort operators so that we try to match the longest ones first
     let mut sorted_operators: Box<[&str]> = OPERATORS.into();
     sorted_operators.sort_unstable_by_key(|op| usize::MAX - op.len());
@@ -82,4 +82,5 @@ pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
             .join("|")
     ))
     .expect("regex")
-});
+    };
+}
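Note that thread_local! accepts an arbitrary initializer block and pub visibility, so the sorting logic moves out of the old Lazy closure nearly unchanged; other modules now reach OPERATOR_RE through its LocalKey handle (see the OPERATOR_RE.with call in core/mod.rs above) instead of dereferencing a shared static.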
7 changes: 4 additions & 3 deletions native/libcst/src/tokenizer/text_position/mod.rs
@@ -5,14 +5,15 @@
 
 mod char_width;
 
-use once_cell::sync::Lazy;
 use regex::Regex;
 use std::fmt;
 
 use crate::tokenizer::debug_utils::EllipsisDebug;
 use char_width::NewlineNormalizedCharWidths;
 
-static CR_OR_LF_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\r\n]").expect("regex"));
+thread_local! {
+    static CR_OR_LF_RE: Regex = Regex::new(r"[\r\n]").expect("regex");
+}
 
 pub trait TextPattern {
     fn match_len(&self, text: &str) -> Option<usize>;
@@ -98,7 +99,7 @@ impl<'t> TextPosition<'t> {
         match match_len {
             Some(match_len) => {
                 assert!(
-                    !CR_OR_LF_RE.is_match(&rest_of_text[..match_len]),
+                    !CR_OR_LF_RE.with(|r| r.is_match(&rest_of_text[..match_len])),
                     "matches pattern must not match a newline",
                 );
                 true