Skip to content

Commit 5cf4c4d

Browse files
authored
Tables: tolerate trailing whitespace on rows; reject empty delimiter cells (#247)
Two table delimiter-row divergences (carve-js and carve-rs agreed; the PHP impl was the outlier), ported from carve-php commit 28d4c10: 1. Trailing whitespace after a row's closing pipe broke recognition. A line like `| a |` followed by spaces, or a separator `|---|` followed by spaces, was not treated as a table row, so a table with a trailing-whitespace separator split into separate blocks and the separator rendered as a paragraph. Trailing spaces/tabs after the closing pipe are now stripped before the structural checks (isTableRow, isSeparatorRow, the cell parsers, and the BlockParser row loop). 2. A delimiter row with an empty cell (`|---||`) was accepted as a header separator. isSeparatorRow used a character class that put `|` inside it and so never validated per cell. It now splits the row into cells and requires each to be a delimiter cell (optional whitespace, optional leading `:`, one or more `-`, optional trailing `:`); an empty cell or any other content disqualifies the row, which then stays an ordinary data row. Behavior now matches the JS and Rust implementations on these delimiter-row edge cases.
1 parent 39c56cc commit 5cf4c4d

3 files changed

Lines changed: 92 additions & 2 deletions

File tree

src/Parser/Block/TableParser.php

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ public function isTableRow(string $line): bool
3434
return false;
3535
}
3636

37+
// Trailing whitespace after the closing pipe is insignificant (parity
38+
// with carve-js / carve-rs); strip it before the structural checks.
39+
$line = rtrim($line, " \t");
40+
3741
// Strip row attributes if present (|...|{.class})
3842
$lineWithoutRowAttrs = $this->stripRowAttributes($line);
3943

@@ -88,7 +92,29 @@ public function extractRowAttributes(string $line): array
8892
*/
8993
public function isSeparatorRow(string $line): bool
9094
{
91-
return preg_match('/^\|[\s:|-]+\|$/', $line) === 1 && str_contains($line, '-');
95+
// Trailing whitespace after the closing pipe is insignificant.
96+
$line = rtrim($line, " \t");
97+
98+
$len = strlen($line);
99+
if ($len < 2 || $line[0] !== '|' || $line[$len - 1] !== '|') {
100+
return false;
101+
}
102+
103+
// Every cell must be a delimiter cell: optional whitespace, an optional
104+
// leading ':', one or more '-', an optional trailing ':', optional
105+
// whitespace. An EMPTY cell (`|---||`) or any other content disqualifies
106+
// the row -- it is then an ordinary data row (matches carve-js/carve-rs).
107+
$cells = $this->parseTableCells($line);
108+
if ($cells === []) {
109+
return false;
110+
}
111+
foreach ($cells as $cell) {
112+
if (preg_match('/^\s*:?-+:?\s*$/', $cell) !== 1) {
113+
return false;
114+
}
115+
}
116+
117+
return true;
92118
}
93119

94120
/**
@@ -152,6 +178,9 @@ public function parseTableCells(string $line): array
152178
// Strip row attributes first
153179
$line = $this->stripRowAttributes($line);
154180

181+
// Trailing whitespace after the closing pipe is insignificant.
182+
$line = rtrim($line, " \t");
183+
155184
// Remove leading and trailing |
156185
$line = substr($line, 1, -1);
157186

@@ -357,6 +386,9 @@ public function parseTableCellsRaw(string $line): array
357386
// Strip row attributes first
358387
$line = $this->stripRowAttributes($line);
359388

389+
// Trailing whitespace after the closing pipe is insignificant.
390+
$line = rtrim($line, " \t");
391+
360392
// Must start with | to be a potential table row
361393
if (!str_starts_with($line, '|')) {
362394
return [];

src/Parser/BlockParser.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2758,7 +2758,9 @@ protected function tryParseTable(Node $parent, array $lines, int $start): ?int
27582758
// Strip row attributes for validation (|...|{.class} → |...|)
27592759
$lineWithoutRowAttrs = $this->tableParser->stripRowAttributes($currentLine);
27602760

2761-
if (!preg_match('/^\|.*\|$/', $lineWithoutRowAttrs)) {
2761+
// Trailing whitespace after the closing pipe is insignificant
2762+
// (parity with carve-js / carve-rs).
2763+
if (!preg_match('/^\|.*\|[ \t]*$/', $lineWithoutRowAttrs)) {
27622764
break;
27632765
}
27642766

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Djot\Test\TestCase;
6+
7+
use Djot\DjotConverter;
8+
use PHPUnit\Framework\TestCase;
9+
10+
/**
11+
* Tests for table delimiter (separator) row edge cases.
12+
*
13+
* - Trailing whitespace after a row's closing pipe is insignificant.
14+
* - A delimiter row with an empty cell (|---||) is not a delimiter row.
15+
*
16+
* Ported from carve-php (parity with carve-js / carve-rs).
17+
*/
18+
class TableDelimiterRowTest extends TestCase
19+
{
20+
protected DjotConverter $converter;
21+
22+
protected function setUp(): void
23+
{
24+
$this->converter = new DjotConverter();
25+
}
26+
27+
public function testSeparatorRowWithTrailingWhitespaceStillPromotesHeader(): void
28+
{
29+
// Trailing whitespace after the closing pipe is insignificant; the
30+
// separator must still promote the first row to a header.
31+
$result = $this->converter->convert("| H | G |\n|---| \n| a | b |");
32+
33+
$this->assertStringContainsString('<table>', $result);
34+
$this->assertStringContainsString('<th>H</th>', $result);
35+
$this->assertStringContainsString('<th>G</th>', $result);
36+
$this->assertStringNotContainsString('<p>', $result);
37+
}
38+
39+
public function testDataRowWithTrailingWhitespaceStillParsesAsTable(): void
40+
{
41+
$result = $this->converter->convert('| a | ');
42+
43+
$this->assertStringContainsString('<td>a</td>', $result);
44+
$this->assertStringNotContainsString('<p>', $result);
45+
}
46+
47+
public function testSeparatorRowWithEmptyCellIsNotASeparator(): void
48+
{
49+
// `|---||` has an empty second cell, so it is NOT a delimiter row: the
50+
// first row must not be promoted to a header.
51+
$result = $this->converter->convert("| H | G |\n|---||\n| a | b |");
52+
53+
$this->assertStringNotContainsString('<th>', $result);
54+
$this->assertStringContainsString('<td>H</td>', $result);
55+
}
56+
}

0 commit comments

Comments
 (0)