diff --git a/.changeset/block-splitter-layer1.md b/.changeset/block-splitter-layer1.md
new file mode 100644
index 0000000000000..32a757bf7705e
--- /dev/null
+++ b/.changeset/block-splitter-layer1.md
@@ -0,0 +1,5 @@
+---
+"@rocket.chat/message-parser": patch
+---
+
+feat(message-parser): implement BlockSplitter PoC (Layer 1)
diff --git a/packages/message-parser/benchmarks/parser.bench.ts b/packages/message-parser/benchmarks/parser.bench.ts
index 107c3847ccd90..43cef94488e25 100644
--- a/packages/message-parser/benchmarks/parser.bench.ts
+++ b/packages/message-parser/benchmarks/parser.bench.ts
@@ -61,6 +61,8 @@ const categories: BenchCategory[] = [
 			{ name: 'strike', input: '~~Hello world~~' },
 			{ name: 'nested', input: '**bold _italic_ and ~~strike~~**' },
 			{ name: 'deep nesting', input: '**bold _italic ~~strike _deep italic_~~_**' },
+			{ name: 'bold + italic mixed', input: '*Bold text* and _italic text_ in a message' },
+			{ name: 'deeply nested', input: '*bold _italic ~~strike~~ italic_ bold*' },
 			{ name: 'multiple', input: '**bold** normal _italic_ normal ~~strike~~ **more bold** _more italic_' },
 		],
 	},
@@ -83,6 +85,7 @@ const categories: BenchCategory[] = [
 			{ name: 'triple unicode (BigEmoji)', input: '😀🚀🌈', options: fullOptions },
 			{ name: 'in text', input: 'Hello :smile: world :heart: test :rocket: done', options: fullOptions },
 			{ name: 'mixed', input: 'Great job :thumbsup: 🎉 keep going :rocket:', options: fullOptions },
+			{ name: 'emoji heavy', input: ':smile: :wave: :rocket: :fire: :heart: :100:', options: fullOptions },
 		],
 	},
 	{
@@ -92,6 +95,7 @@ const categories: BenchCategory[] = [
 			{ name: 'multiple users', input: '@admin @user1 @moderator' },
 			{ name: 'channel', input: '#general' },
 			{ name: 'mixed', input: 'Hey @admin check #general and @user1' },
+			{ name: 'mentions (suggested)', input: 'Hey @john and @jane, check #general' },
 		],
 	},
 	{
@@ -141,6 +145,10 @@ const categories: BenchCategory[] = [
 				name: 'long with formatting',
 				input: '**bold** _italic_ ~~strike~~ `code` @user #channel :smile: https://example.com '.repeat(10).trim(),
 			},
+			{
+				name: 'unmatched markers (pathological)',
+				input: '*_~*_~*_~*_~*_~ hello world absolutely no closing markers anywhere at all',
+			},
 		],
 	},
 	{
@@ -158,6 +166,11 @@ const categories: BenchCategory[] = [
 					'**Release Notes v7.0**\n- [x] Fix #12345\n- [ ] Update docs\n\n> Important: check https://docs.rocket.chat\n\ncc @admin @devlead #releases :rocket:',
 				options: fullOptions,
 			},
+			{
+				name: 'realistic chat message',
+				input: 'Hello @team, please review the *important* update:\n\n1. Run `yarn build`\n2. Check #deployments\n\n*Thanks!* :rocket:',
+				options: fullOptions,
+			},
 		],
 	},
 	{
diff --git a/packages/message-parser/src/BlockSplitter.ts b/packages/message-parser/src/BlockSplitter.ts
new file mode 100644
index 0000000000000..9b59cf2e759e3
--- /dev/null
+++ b/packages/message-parser/src/BlockSplitter.ts
@@ -0,0 +1,240 @@
+/** Kinds of top-level blocks produced by {@link BlockSplitter.split}. */
+export enum BlockType {
+	PARAGRAPH = 'PARAGRAPH',
+	HEADING = 'HEADING',
+	CODE = 'CODE',
+	LIST = 'LIST',
+	QUOTE = 'QUOTE',
+}
+
+/** A contiguous run of input lines classified as one block. */
+export type Block = {
+	type: BlockType;
+	/** Block text. Lists and quotes keep their markers; headings and code blocks do not. */
+	content: string;
+	/** Headings only: number of leading '#' characters. */
+	level?: number;
+	/** Code blocks only: trimmed text that follows the opening fence. */
+	language?: string;
+	/** Lists only: becomes `undefined` once ordered and unordered items are mixed. */
+	ordered?: boolean;
+	/** Code blocks only: `true` when the closing fence is missing. */
+	incomplete?: boolean;
+};
+
+/**
+ * Line-oriented splitter that partitions a message into coarse blocks:
+ * paragraphs, headings, fenced code, lists and quotes.
+ */
+export class BlockSplitter {
+	/**
+	 * Splits `input` into blocks, in source order.
+	 *
+	 * @param input Raw message text; LF and CRLF line endings are both accepted.
+	 * @returns The blocks found; empty when `input` holds only blank lines.
+	 */
+	public static split(input: string): Block[] {
+		const lines = input.split(/\r?\n/);
+		const blocks: Block[] = [];
+		let currentBlock: Block | null = null;
+
+		for (let i = 0; i < lines.length; i++) {
+			const line = lines[i];
+
+			// A heading is always a single-line block of its own.
+			const headingResult = this.parseHeading(line);
+			if (headingResult) {
+				this.flush(blocks, currentBlock);
+				currentBlock = null;
+				blocks.push({
+					type: BlockType.HEADING,
+					content: headingResult.content,
+					level: headingResult.level,
+				});
+				continue;
+			}
+
+			// Fenced code: consume every line up to the closing fence or the end of input.
+			if (line.startsWith('```')) {
+				this.flush(blocks, currentBlock);
+				currentBlock = null;
+				const language = line.slice(3).trim();
+				const codeLines: string[] = [];
+				i++;
+				while (i < lines.length && !lines[i].startsWith('```')) {
+					codeLines.push(lines[i]);
+					i++;
+				}
+				blocks.push({
+					type: BlockType.CODE,
+					content: codeLines.join('\n'),
+					language,
+					// Running off the end of the input means the fence was never closed.
+					incomplete: i >= lines.length,
+				});
+				continue;
+			}
+
+			const isIndented = line.length > 0 && line.charCodeAt(0) === 32; // ' '
+
+			// A blank line ends the current block. The exception is a whitespace-only line
+			// starting with a space inside an open list: it is kept so the list can continue.
+			if (line.trim() === '') {
+				if (isIndented && currentBlock?.type === BlockType.LIST) {
+					currentBlock.content += `\n${line}`;
+				} else {
+					this.flush(blocks, currentBlock);
+					currentBlock = null;
+				}
+				continue;
+			}
+
+			const listResult = this.parseListItem(line);
+			if (listResult) {
+				if (currentBlock?.type !== BlockType.LIST) {
+					this.flush(blocks, currentBlock);
+					currentBlock = {
+						type: BlockType.LIST,
+						content: line,
+						ordered: listResult.isOrdered,
+					};
+				} else {
+					// Mixing ordered and unordered items leaves the list kind undetermined.
+					if (currentBlock.ordered !== undefined && currentBlock.ordered !== listResult.isOrdered) {
+						currentBlock.ordered = undefined;
+					}
+					currentBlock.content += `\n${line}`;
+				}
+				continue;
+			}
+
+			// An indented line that is not an item still belongs to the open list.
+			if (isIndented && currentBlock?.type === BlockType.LIST) {
+				currentBlock.content += `\n${line}`;
+				continue;
+			}
+
+			if (line.startsWith('>')) {
+				if (currentBlock?.type !== BlockType.QUOTE) {
+					this.flush(blocks, currentBlock);
+					currentBlock = {
+						type: BlockType.QUOTE,
+						content: line,
+					};
+				} else {
+					currentBlock.content += `\n${line}`;
+				}
+				continue;
+			}
+
+			if (currentBlock?.type !== BlockType.PARAGRAPH) {
+				this.flush(blocks, currentBlock);
+				currentBlock = {
+					type: BlockType.PARAGRAPH,
+					content: line,
+				};
+			} else {
+				currentBlock.content += `\n${line}`;
+			}
+		}
+
+		this.flush(blocks, currentBlock);
+		return blocks;
+	}
+
+	/**
+	 * Recognises a heading line: one to six '#', a space, then non-empty content.
+	 *
+	 * @returns The heading level and text, or `null` when `line` is not a heading.
+	 */
+	private static parseHeading(line: string): { level: number; content: string } | null {
+		let level = 0;
+		let pos = 0;
+
+		// Count leading '#' characters (at most 6)
+		while (pos < line.length && line.charCodeAt(pos) === 35 /* '#' */ && level < 6) {
+			level++;
+			pos++;
+		}
+
+		if (level === 0) {
+			return null;
+		}
+
+		// A space must follow the '#' run
+		if (pos >= line.length || line.charCodeAt(pos) !== 32 /* ' ' */) {
+			return null;
+		}
+
+		// Everything after that space is the heading content
+		const content = line.slice(pos + 1);
+		if (content.length === 0) {
+			return null;
+		}
+
+		return { level, content };
+	}
+
+	/**
+	 * Recognises a list item: optional leading spaces, then either digits followed by
+	 * '.' or one of '-', '*', '+', then a space and at least one more character.
+	 *
+	 * @returns Whether the item is ordered, or `null` when `line` is not a list item.
+	 */
+	private static parseListItem(line: string): { isOrdered: boolean } | null {
+		let pos = 0;
+
+		// Skip leading spaces
+		while (pos < line.length && line.charCodeAt(pos) === 32 /* ' ' */) {
+			pos++;
+		}
+
+		const start = pos;
+
+		// Ordered item: digits followed by '.'
+		if (pos < line.length && line.charCodeAt(pos) >= 48 && line.charCodeAt(pos) <= 57 /* 0-9 */) {
+			while (pos < line.length && line.charCodeAt(pos) >= 48 && line.charCodeAt(pos) <= 57) {
+				pos++;
+			}
+			if (pos < line.length && line.charCodeAt(pos) === 46 /* '.' */) {
+				pos++;
+				// A space and some content must follow the '.'
+				if (pos < line.length && line.charCodeAt(pos) === 32 /* ' ' */ && pos + 1 < line.length) {
+					return { isOrdered: true };
+				}
+			}
+			// Not an ordered item: rewind so the marker check below starts at the same place
+			pos = start;
+		}
+
+		// Unordered item: '-', '*' or '+'
+		const char = line.charCodeAt(pos);
+		if (char === 45 /* '-' */ || char === 42 /* '*' */ || char === 43 /* '+' */) {
+			pos++;
+			// A space and some content must follow the marker
+			if (pos < line.length && line.charCodeAt(pos) === 32 /* ' ' */ && pos + 1 < line.length) {
+				return { isOrdered: false };
+			}
+		}
+
+		return null;
+	}
+
+	/**
+	 * Appends `block` to `blocks`; does nothing for `null`. Whitespace-only lines left
+	 * at the end of a list are dropped first, since they only matter between items.
+	 */
+	private static flush(blocks: Block[], block: Block | null): void {
+		if (!block) {
+			return;
+		}
+		if (block.type === BlockType.LIST) {
+			const kept = block.content.split('\n');
+			while (kept.length > 1 && kept[kept.length - 1].trim() === '') {
+				kept.pop();
+			}
+			block.content = kept.join('\n');
+		}
+		blocks.push(block);
+	}
+}
diff --git a/packages/message-parser/tests/blockSplitter.spec.ts b/packages/message-parser/tests/blockSplitter.spec.ts
new file mode 100644
index 0000000000000..9f167c75690e9
--- /dev/null
+++ b/packages/message-parser/tests/blockSplitter.spec.ts
@@ -0,0 +1,149 @@
+import { BlockSplitter, BlockType } from '../src/BlockSplitter';
+
+describe('BlockSplitter', () => {
+	it('should split simple paragraphs', () => {
+		const input = 'Hello\nWorld';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.PARAGRAPH);
+		expect(blocks[0].content).toBe('Hello\nWorld');
+	});
+
+	it('should identify headings', () => {
+		const input = '# Heading 1\n## Heading 2\nContent';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(3);
+		expect(blocks[0].type).toBe(BlockType.HEADING);
+		expect(blocks[0].level).toBe(1);
+		expect(blocks[1].type).toBe(BlockType.HEADING);
+		expect(blocks[1].level).toBe(2);
+	});
+
+	it('should identify code blocks', () => {
+		const input = 'Pre\n```javascript\nconst a = 1;\n```\nPost';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(3);
+		expect(blocks[1].type).toBe(BlockType.CODE);
+		expect(blocks[1].language).toBe('javascript');
+		expect(blocks[1].content).toBe('const a = 1;');
+	});
+
+	it('should handle list splitting and preserve full syntax', () => {
+		const input = '- item 1\n* item 2\n1. item 3';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.LIST);
+		expect(blocks[0].content).toBe('- item 1\n* item 2\n1. item 3');
+	});
+
+	it('should handle nested lists via indentation', () => {
+		const input = '- Level 1\n - Level 2\n - Level 3';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].content).toBe('- Level 1\n - Level 2\n - Level 3');
+	});
+
+	it('should allow indented blank lines to continue a list', () => {
+		const input = '- item 1\n \n- item 2';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].content).toBe('- item 1\n \n- item 2');
+	});
+
+	it('should drop trailing indented blank lines when a list ends', () => {
+		const input = '- item 1\n \nParagraph text';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(2);
+		expect(blocks[0].content).toBe('- item 1');
+		expect(blocks[1].type).toBe(BlockType.PARAGRAPH);
+	});
+
+	it('should correctly detect boundaries: list followed by heading', () => {
+		const input = '- list item\n\n# Heading';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(2);
+		expect(blocks[0].type).toBe(BlockType.LIST);
+		expect(blocks[1].type).toBe(BlockType.HEADING);
+	});
+
+	it('should identify blockquotes and preserve markers', () => {
+		const input = '> quote line 1\n> quote line 2';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.QUOTE);
+		expect(blocks[0].content).toBe('> quote line 1\n> quote line 2');
+	});
+
+	it('should support nested blockquotes', () => {
+		const input = '> outer\n>> inner';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.QUOTE);
+		expect(blocks[0].content).toBe('> outer\n>> inner');
+	});
+
+	it('should set ordered to undefined for mixed ordered and unordered list items', () => {
+		const input = '- unordered\n1. ordered';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.LIST);
+		expect(blocks[0].ordered).toBeUndefined();
+	});
+
+	it('should keep ordered=true for fully ordered lists', () => {
+		const input = '1. first\n2. second';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.LIST);
+		expect(blocks[0].ordered).toBe(true);
+	});
+
+	it('should keep ordered=false for fully unordered lists', () => {
+		const input = '- first\n* second';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.LIST);
+		expect(blocks[0].ordered).toBe(false);
+	});
+
+	it('should create a new paragraph block after a list block', () => {
+		const input = '- list item\n\nParagraph text';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(2);
+		expect(blocks[0].type).toBe(BlockType.LIST);
+		expect(blocks[1].type).toBe(BlockType.PARAGRAPH);
+		expect(blocks[1].content).toBe('Paragraph text');
+	});
+
+	it('should create a new paragraph block after a quote block', () => {
+		const input = '> blockquote\n\nParagraph text';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(2);
+		expect(blocks[0].type).toBe(BlockType.QUOTE);
+		expect(blocks[1].type).toBe(BlockType.PARAGRAPH);
+		expect(blocks[1].content).toBe('Paragraph text');
+	});
+
+	it('should handle empty input correctly', () => {
+		const input = '';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(0);
+	});
+
+	it('should yield a CODE block with incomplete flag for an unclosed code fence', () => {
+		const input = '```js\ncode';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.CODE);
+		expect(blocks[0].content).toBe('code');
+		expect(blocks[0].incomplete).toBe(true);
+	});
+
+	it('should treat a heading without a space as a paragraph', () => {
+		const input = '#NoSpace';
+		const blocks = BlockSplitter.split(input);
+		expect(blocks.length).toBe(1);
+		expect(blocks[0].type).toBe(BlockType.PARAGRAPH);
+		expect(blocks[0].content).toBe('#NoSpace');
+	});
+});
diff --git a/packages/message-parser/tests/skip-flags-regression.spec.ts b/packages/message-parser/tests/skip-flags-regression.spec.ts
new file mode 100644
index 0000000000000..54de6ecc9ea0b
--- /dev/null
+++ b/packages/message-parser/tests/skip-flags-regression.spec.ts
@@ -0,0 +1,15 @@
+import { parse } from '../src';
+
+describe('Skip Flags Regression (Complexity Audit)', () => {
+	it('should parse nested formatting across multiple depths without throwing', () => {
+		for (let d = 1; d <= 50; d++) {
+			const input = `${'*'.repeat(d)}text${'*'.repeat(d)}`;
+			expect(() => parse(input)).not.toThrow();
+		}
+	});
+
+	it('should handle pathological unmatched markers without crashing', () => {
+		const pathological = '*_~*_~*_~*_~*_~ hello'.repeat(5);
+		expect(() => parse(pathological)).not.toThrow();
+	});
+});