Improve whitespace regex detection (#1589)

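Previously, splitting a whitespace terminal rule into separate "spaces/tabs" and "newlines" terminals caused the "newlines" terminal to be detected as a comment when generating the TextMate grammar, because whitespace detection only tested the terminal's regex against a single space character. Detection now tests the regex against every character matched by the `\s` character class.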
eclipse-langium · Jul 24, 2024 · 3b007c2 · 3b007c2
1 parent 3e62f1a
commit 3b007c2
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 3 deletions.
diff --git a/packages/langium/src/utils/grammar-utils.ts b/packages/langium/src/utils/grammar-utils.ts
@@ -10,7 +10,7 @@ import type { AstNode, CstNode } from '../syntax-tree.js';
 import { isCompositeCstNode } from '../syntax-tree.js';
 import { getContainerOfType, streamAllContents } from './ast-utils.js';
 import { streamCst } from './cst-utils.js';
-import { escapeRegExp } from './regexp-utils.js';
+import { escapeRegExp, isWhitespace } from './regexp-utils.js';
 
 /**
  * Returns the entry rule of the given grammar, if any. If the grammar file does not contain an entry rule,
@@ -92,7 +92,7 @@ export function getCrossReferenceTerminal(crossRef: ast.CrossReference): ast.Abs
  * that contains visible characters is considered a comment.
  */
 export function isCommentTerminal(terminalRule: ast.TerminalRule): boolean {
-    return terminalRule.hidden && !terminalRegex(terminalRule).test(' ');
+    return terminalRule.hidden && !isWhitespace(terminalRegex(terminalRule));
 }
 
 /**

diff --git a/packages/langium/src/utils/regexp-utils.ts b/packages/langium/src/utils/regexp-utils.ts
@@ -138,9 +138,17 @@ export function isMultilineComment(regexp: RegExp | string): boolean {
     }
 }
 
+/**
+ * A list of all characters that are considered whitespace by the '\s' RegExp character class.
+ * Taken from [MDN](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes).
+ */
+export const whitespaceCharacters = (
+    '\f\n\r\t\v\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007' +
+    '\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff').split('');
+
 export function isWhitespace(value: RegExp | string): boolean {
     const regexp = typeof value === 'string' ? new RegExp(value) : value;
-    return regexp.test(' ');
+    return whitespaceCharacters.some((ws) => regexp.test(ws));
 }
 
 export function escapeRegExp(value: string): string {