Skip to content

Commit

Permalink
Improve whitespace regex detection (#1589)
Browse files Browse the repository at this point in the history
Previously, splitting a terminal rule into separate "spaces/tabs"
and "newlines" terminals would cause the "newlines" one to be detected
as a comment when generating a TextMate grammar.
  • Loading branch information
aabounegm authored Jul 24, 2024
1 parent 3e62f1a commit 3b007c2
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
4 changes: 2 additions & 2 deletions packages/langium/src/utils/grammar-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import type { AstNode, CstNode } from '../syntax-tree.js';
import { isCompositeCstNode } from '../syntax-tree.js';
import { getContainerOfType, streamAllContents } from './ast-utils.js';
import { streamCst } from './cst-utils.js';
import { escapeRegExp } from './regexp-utils.js';
import { escapeRegExp, isWhitespace } from './regexp-utils.js';

/**
* Returns the entry rule of the given grammar, if any. If the grammar file does not contain an entry rule,
Expand Down Expand Up @@ -92,7 +92,7 @@ export function getCrossReferenceTerminal(crossRef: ast.CrossReference): ast.Abs
* that contains visible characters is considered a comment.
*/
export function isCommentTerminal(terminalRule: ast.TerminalRule): boolean {
    // Only hidden terminals can be comments. Among those, a terminal whose regex is
    // reported as whitespace by `isWhitespace` (imported from './regexp-utils.js')
    // is not a comment. The earlier single-space probe (`.test(' ')`) is dropped: it
    // returned first and made the `isWhitespace` check unreachable.
    return terminalRule.hidden && !isWhitespace(terminalRegex(terminalRule));
}

/**
Expand Down
10 changes: 9 additions & 1 deletion packages/langium/src/utils/regexp-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,17 @@ export function isMultilineComment(regexp: RegExp | string): boolean {
}
}

/**
 * A set of all characters that are considered whitespace by the '\s' RegExp character class.
 * Taken from [MDN](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes).
 */
export const whitespaceCharacters = (
    '\f\n\r\t\v\u0020\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007' +
    '\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff').split('');

/**
 * Determines whether the given regular expression matches at least one whitespace character.
 *
 * Each entry of {@link whitespaceCharacters} is tested on its own, so a pattern that only
 * accepts newlines (e.g. `/\n+/`) is recognized as whitespace just like one that accepts
 * spaces or tabs.
 *
 * @param value A regular expression, or the source of one (compiled without flags).
 * @returns `true` if any single whitespace character is matched by the expression.
 * @throws SyntaxError if `value` is a string that is not a valid regular expression.
 */
export function isWhitespace(value: RegExp | string): boolean {
    // Always probe with a private copy: `test` on a global/sticky RegExp reads and updates
    // `lastIndex`, which could skip a candidate and would leak state into the caller's object.
    const regexp = new RegExp(value);
    return whitespaceCharacters.some((ws) => regexp.test(ws));
}

export function escapeRegExp(value: string): string {
Expand Down

0 comments on commit 3b007c2

Please sign in to comment.