Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[lexical-markdown] Fix: normalize markdown in $convertFromMarkdownString to comply with CommonMark spec #6608

Merged
merged 11 commits into from
Sep 10, 2024
66 changes: 59 additions & 7 deletions packages/lexical-markdown/src/MarkdownTransformers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,14 @@ export type TextMatchTransformer = Readonly<{
type: 'text-match';
}>;

const ORDERED_LIST_REGEX = /^(\s*)(\d{1,})\.\s/;
const UNORDERED_LIST_REGEX = /^(\s*)[-*+]\s/;
const CHECK_LIST_REGEX = /^(\s*)(?:-\s)?\s?(\[(\s|x)?\])\s/i;
const HEADING_REGEX = /^(#{1,6})\s/;
const QUOTE_REGEX = /^>\s/;
const CODE_START_REGEX = /^[ \t]*```(\w+)?/;
const CODE_END_REGEX = /[ \t]*```$/;

const createBlockNode = (
createNode: (match: Array<string>) => ElementNode,
): ElementTransformer['replace'] => {
Expand Down Expand Up @@ -266,7 +274,7 @@ export const HEADING: ElementTransformer = {
const level = Number(node.getTag().slice(1));
return '#'.repeat(level) + ' ' + exportChildren(node);
},
regExp: /^(#{1,6})\s/,
regExp: HEADING_REGEX,
replace: createBlockNode((match) => {
const tag = ('h' + match[1].length) as HeadingTagType;
return $createHeadingNode(tag);
Expand All @@ -288,7 +296,7 @@ export const QUOTE: ElementTransformer = {
}
return output.join('\n');
},
regExp: /^>\s/,
regExp: QUOTE_REGEX,
replace: (parentNode, children, _match, isImport) => {
if (isImport) {
const previousNode = parentNode.getPreviousSibling();
Expand Down Expand Up @@ -328,9 +336,9 @@ export const CODE: MultilineElementTransformer = {
},
regExpEnd: {
optional: true,
regExp: /[ \t]*```$/,
regExp: CODE_END_REGEX,
},
regExpStart: /^[ \t]*```(\w+)?/,
regExpStart: CODE_START_REGEX,
replace: (
rootNode,
children,
Expand Down Expand Up @@ -399,7 +407,7 @@ export const UNORDERED_LIST: ElementTransformer = {
export: (node, exportChildren) => {
return $isListNode(node) ? listExport(node, exportChildren, 0) : null;
},
regExp: /^(\s*)[-*+]\s/,
regExp: UNORDERED_LIST_REGEX,
replace: listReplace('bullet'),
type: 'element',
};
Expand All @@ -409,7 +417,7 @@ export const CHECK_LIST: ElementTransformer = {
export: (node, exportChildren) => {
return $isListNode(node) ? listExport(node, exportChildren, 0) : null;
},
regExp: /^(\s*)(?:-\s)?\s?(\[(\s|x)?\])\s/i,
regExp: CHECK_LIST_REGEX,
replace: listReplace('check'),
type: 'element',
};
Expand All @@ -419,7 +427,7 @@ export const ORDERED_LIST: ElementTransformer = {
export: (node, exportChildren) => {
return $isListNode(node) ? listExport(node, exportChildren, 0) : null;
},
regExp: /^(\s*)(\d{1,})\.\s/,
regExp: ORDERED_LIST_REGEX,
replace: listReplace('number'),
type: 'element',
};
Expand Down Expand Up @@ -519,3 +527,47 @@ export const LINK: TextMatchTransformer = {
trigger: ')',
type: 'text-match',
};

export function normalizeMarkdown(input: string): string {
const lines = input.split('\n');
let inCodeBlock = false;
const sanitizedLines: string[] = [];

for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const lastLine = sanitizedLines[sanitizedLines.length - 1];

// Detect the start or end of a code block
if (CODE_START_REGEX.test(line) || CODE_END_REGEX.test(line)) {
inCodeBlock = !inCodeBlock;
sanitizedLines.push(line);
continue;
}
GermanJablo marked this conversation as resolved.
Show resolved Hide resolved

// If we are inside a code block, keep the line unchanged
if (inCodeBlock) {
sanitizedLines.push(line);
continue;
}

// In markdown the concept of "empty paragraphs" does not exist.
// Blocks must be separated by an empty line. Non-empty adjacent lines must be merged.
if (
line === '' ||
lastLine === '' ||
!lastLine ||
HEADING_REGEX.test(lastLine) ||
HEADING_REGEX.test(line) ||
QUOTE_REGEX.test(line) ||
ORDERED_LIST_REGEX.test(line) ||
UNORDERED_LIST_REGEX.test(line) ||
CHECK_LIST_REGEX.test(line)
) {
sanitizedLines.push(line);
} else {
sanitizedLines[sanitizedLines.length - 1] = lastLine + line;
}
}

return sanitizedLines.join('\n');
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ import {
Transformer,
TRANSFORMERS,
} from '../..';
import {MultilineElementTransformer} from '../../MarkdownTransformers';
import {
MultilineElementTransformer,
normalizeMarkdown,
} from '../../MarkdownTransformers';

// Matches html within a mdx file
const MDX_HTML_TRANSFORMER: MultilineElementTransformer = {
Expand Down Expand Up @@ -92,19 +95,36 @@ describe('Markdown', () => {
html: '<h6><span style="white-space: pre-wrap;">Hello world</span></h6>',
md: '###### Hello world',
},
{
// Multiline paragraphs: https://spec.commonmark.org/dingus/?text=Hello%0Aworld%0A!
html: '<p><span style="white-space: pre-wrap;">Helloworld!</span></p>',
md: ['Hello', 'world', '!'].join('\n'),
skipExport: true,
},
{
// Multiline paragraphs
html: '<p><span style="white-space: pre-wrap;">Hello</span><br><span style="white-space: pre-wrap;">world</span><br><span style="white-space: pre-wrap;">!</span></p>',
// TO-DO: It would be nice to support also hard line breaks (<br>) as \ or double spaces
// See https://spec.commonmark.org/0.31.2/#hard-line-breaks.
// Example: '<p><span style="white-space: pre-wrap;">Hello\\\nworld\\\n!</span></p>',
html: '<p><span style="white-space: pre-wrap;">Hello<br>world<br>!</span></p>',
md: ['Hello', 'world', '!'].join('\n'),
skipImport: true,
},
{
html: '<blockquote><span style="white-space: pre-wrap;">Hello</span><br><span style="white-space: pre-wrap;">world!</span></blockquote>',
md: '> Hello\n> world!',
},
// TO-DO: <br> should be preserved
// {
// html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world<br>!<br>!</span></li></ul>',
// md: '- Hello\n- world<br>!<br>!',
// skipImport: true,
// },
{
// Multiline list items
html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world</span><br><span style="white-space: pre-wrap;">!</span><br><span style="white-space: pre-wrap;">!</span></li></ul>',
// Multiline list items: https://spec.commonmark.org/dingus/?text=-%20Hello%0A-%20world%0A!%0A!
html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world!!</span></li></ul>',
md: '- Hello\n- world\n!\n!',
skipExport: true,
},
{
html: '<ul><li value="1"><span style="white-space: pre-wrap;">Hello</span></li><li value="2"><span style="white-space: pre-wrap;">world</span></li></ul>',
Expand Down Expand Up @@ -274,8 +294,8 @@ describe('Markdown', () => {
skipExport: true,
},
{
// Import only: multiline quote will be prefixed with ">" on each line during export
html: '<blockquote><span style="white-space: pre-wrap;">Hello</span><br><span style="white-space: pre-wrap;">world</span><br><span style="white-space: pre-wrap;">!</span></blockquote>',
// https://spec.commonmark.org/dingus/?text=%3E%20Hello%0Aworld%0A!
html: '<blockquote><span style="white-space: pre-wrap;">Helloworld!</span></blockquote>',
md: '> Hello\nworld\n!',
skipExport: true,
},
Expand All @@ -298,8 +318,9 @@ describe('Markdown', () => {
},
{
customTransformers: [MDX_HTML_TRANSFORMER],
html: '<p><span style="white-space: pre-wrap;">Some HTML in mdx:</span></p><pre spellcheck="false" data-language="MyComponent"><span style="white-space: pre-wrap;">From HTML: Line 1\nSome Text</span></pre>',
html: '<p><span style="white-space: pre-wrap;">Some HTML in mdx:</span></p><pre spellcheck="false" data-language="MyComponent"><span style="white-space: pre-wrap;">From HTML: Line 1Some Text</span></pre>',
md: 'Some HTML in mdx:\n\n<MyComponent>Line 1\nSome Text</MyComponent>',
skipExport: true,
},
];

Expand Down Expand Up @@ -407,3 +428,47 @@ describe('Markdown', () => {
});
}
});

describe('normalizeMarkdown', () => {
it('should combine lines separated by a single \n unless they are in a codeblock', () => {
const markdown = `
1
2

3

\`\`\`md
1
2

3
\`\`\`

\`\`\`js
1
2

3
\`\`\`
`;
expect(normalizeMarkdown(markdown)).toBe(`
12

3

\`\`\`md
1
2

3
\`\`\`

\`\`\`js
1
2

3
\`\`\`
`);
});
});
4 changes: 3 additions & 1 deletion packages/lexical-markdown/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import {
ITALIC_STAR,
ITALIC_UNDERSCORE,
LINK,
normalizeMarkdown,
ORDERED_LIST,
QUOTE,
STRIKETHROUGH,
Expand Down Expand Up @@ -82,11 +83,12 @@ function $convertFromMarkdownString(
node?: ElementNode,
shouldPreserveNewLines = false,
): void {
const sanitizedMarkdown = normalizeMarkdown(markdown);
const importMarkdown = createMarkdownImport(
transformers,
shouldPreserveNewLines,
);
return importMarkdown(markdown, node);
return importMarkdown(sanitizedMarkdown, node);
}

/**
Expand Down
11 changes: 4 additions & 7 deletions packages/lexical-playground/__tests__/e2e/Markdown.spec.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -1310,7 +1310,6 @@ const IMPORTED_MARKDOWN_HTML = html`
bold italic strikethrough
</strong>
<span data-lexical-text="true">text,</span>
<br />
<strong
class="PlaygroundEditorTheme__textBold PlaygroundEditorTheme__textItalic PlaygroundEditorTheme__textStrikethrough"
data-lexical-text="true">
Expand Down Expand Up @@ -1408,9 +1407,7 @@ const IMPORTED_MARKDOWN_HTML = html`
dir="ltr">
<span data-lexical-text="true">Blockquotes text goes here</span>
<br />
<span data-lexical-text="true">And second</span>
<br />
<span data-lexical-text="true">line after</span>
<span data-lexical-text="true">And secondline after</span>
</blockquote>
<blockquote
class="PlaygroundEditorTheme__quote PlaygroundEditorTheme__ltr"
Expand Down Expand Up @@ -1488,9 +1485,9 @@ const IMPORTED_MARKDOWN_HTML = html`
class="PlaygroundEditorTheme__listItem PlaygroundEditorTheme__ltr"
dir="ltr"
value="1">
<span data-lexical-text="true">And can be nested</span>
<br />
<span data-lexical-text="true">and multiline as well</span>
<span data-lexical-text="true">
And can be nested and multiline as well
</span>
</li>
</ol>
</li>
Expand Down
Loading