From 9c50c5ff6a5efa2d78039d2a404e1c7d7eb2084c Mon Sep 17 00:00:00 2001 From: Nikolai Kummer <32908738+nikolai-kummer@users.noreply.github.com> Date: Wed, 24 Apr 2024 06:39:10 -0600 Subject: [PATCH] Add docstring to create_template() function and make it faster (#90) * Update create_template with documentation and change the function to use list comprehension to make it faster * Add unit tests for the create_template function * Add slightly faster improvement as local variable is faster than instance variable access * Update docstring and move length assertion to top of function. Removed the param_str variable as it acts like a penalty to function executions where the param_str is not accessed at all. Signed-off-by: Nikolai Kummer --------- Signed-off-by: Nikolai Kummer Co-authored-by: Superskyyy --- drain3/drain.py | 16 +++++++++------- tests/test_drain.py | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drain3/drain.py b/drain3/drain.py index 54a5b70..196aea8 100644 --- a/drain3/drain.py +++ b/drain3/drain.py @@ -413,14 +413,16 @@ def get_seq_distance(self, seq1: Sequence[str], seq2: Sequence[str], include_par return ret_val, param_count def create_template(self, seq1: Sequence[str], seq2: Sequence[str]) -> Sequence[str]: + """ + Loop through two sequences and create a template sequence that + replaces unmatched tokens with the parameter string. 
+ + :param seq1: first sequence + :param seq2: second sequence + :return: template sequence with param_str in place of unmatched tokens + """ assert len(seq1) == len(seq2) - ret_val = list(seq2) - - for i, (token1, token2) in enumerate(zip(seq1, seq2)): - if token1 != token2: - ret_val[i] = self.param_str - - return ret_val + return [token2 if token1 == token2 else self.param_str for token1, token2 in zip(seq1, seq2)] def match(self, content: str, full_search_strategy: str = "never") -> Optional[LogCluster]: """ diff --git a/tests/test_drain.py b/tests/test_drain.py index 6d021f1..cd5d05d 100644 --- a/tests/test_drain.py +++ b/tests/test_drain.py @@ -257,3 +257,18 @@ def test_match_only(self): c: LogCluster = model.match("nothing") self.assertIsNone(c) + def test_create_template(self): + model = Drain(param_str="*") + + seq1 = ["aa", "bb", "dd"] + seq2 = ["aa", "bb", "cc"] + + # test for proper functionality + template = model.create_template(seq1, seq2) + self.assertListEqual(["aa", "bb", "*"], template) + + template = model.create_template(seq1, seq1) + self.assertListEqual(seq1, template) + + # Test for equal lengths input vectors + self.assertRaises(AssertionError, model.create_template, seq1, ["aa"]) \ No newline at end of file