Skip to content

Commit dad536e

Browse files
committed
fix(robot): match longest BDD prefix first for multi-word prefixes
French BDD prefixes like "Étant donné que", "Et que", "Mais que" were not fully recognized because shorter prefixes (e.g. "Et") matched before longer ones (e.g. "Et que"). Replaced manual iteration and set lookups with a regex-based approach matching Robot Framework's own strategy: prefixes sorted by length (longest first), compiled into a cached regex pattern. This applies to keyword_finder, model_helper (split/strip/is_bdd), and semantic tokens. Fixes #560
1 parent ee9993d commit dad536e

File tree

5 files changed

+507
-83
lines changed

5 files changed

+507
-83
lines changed

packages/language_server/src/robotcode/language_server/robotframework/parts/semantic_tokens.py

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
SemanticTokenTypes,
5757
)
5858
from robotcode.core.text_document import TextDocument, range_to_utf16
59-
from robotcode.robot.diagnostics.keyword_finder import DEFAULT_BDD_PREFIXES
59+
from robotcode.robot.diagnostics.keyword_finder import DEFAULT_BDD_PREFIX_REGEXP, build_bdd_prefix_regexp
6060
from robotcode.robot.diagnostics.library_doc import (
6161
ALL_RUN_KEYWORDS_MATCHERS,
6262
BUILTIN_LIBRARY_NAME,
@@ -1116,24 +1116,15 @@ def generate_sem_sub_tokens(
11161116
if bdd_match:
11171117
bdd_len = len(bdd_match.group(1))
11181118
else:
1119-
bdd_prefixes = (
1120-
namespace.languages.bdd_prefixes
1119+
bdd_regexp = (
1120+
build_bdd_prefix_regexp(frozenset(namespace.languages.bdd_prefixes))
11211121
if namespace.languages is not None
1122-
else DEFAULT_BDD_PREFIXES
1122+
else DEFAULT_BDD_PREFIX_REGEXP
11231123
)
11241124

1125-
for prefix in bdd_prefixes:
1126-
if token.value.startswith(prefix + " "):
1127-
bdd_len = len(prefix)
1128-
break
1129-
else:
1130-
parts = token.value.split()
1131-
if len(parts) > 1:
1132-
for index in range(1, len(parts)):
1133-
prefix = " ".join(parts[:index]).title()
1134-
if prefix in bdd_prefixes:
1135-
bdd_len = len(prefix)
1136-
break
1125+
bdd_match = bdd_regexp.match(token.value)
1126+
if bdd_match:
1127+
bdd_len = len(bdd_match.group(1))
11371128

11381129
if bdd_len > 0:
11391130
yield SemTokenInfo.from_token(

packages/robot/src/robotcode/robot/diagnostics/keyword_finder.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import functools
22
import re
33
from itertools import chain
4-
from typing import Dict, Iterable, Iterator, List, NamedTuple, Optional, Sequence, Tuple
4+
from typing import Dict, FrozenSet, Iterable, Iterator, List, NamedTuple, Optional, Sequence, Tuple
55

66
from robot.libraries import STDLIBS
77
from robotcode.core.lsp.types import (
@@ -38,6 +38,16 @@ class CancelSearchError(Exception):
3838
DEFAULT_BDD_PREFIXES = {"Given ", "When ", "Then ", "And ", "But "}


@functools.lru_cache(maxsize=None)
def build_bdd_prefix_regexp(prefixes: FrozenSet[str]) -> "re.Pattern[str]":
    """Compile a case-insensitive regex matching any BDD prefix followed by a whitespace separator.

    Prefixes are sorted longest-first so multi-word prefixes (e.g. ``"Et que"``)
    take precedence in the alternation over their shorter sub-prefixes
    (e.g. ``"Et"``). The compiled pattern is cached per prefix set.

    Args:
        prefixes: Configured BDD prefixes (a frozenset so it is hashable for the
            cache). Surrounding whitespace in each prefix is ignored.

    Returns:
        A compiled pattern whose ``group(1)`` captures the matched prefix
        (without the trailing separator).
    """
    # Sort longest-first; tie-break alphabetically so the pattern (and thus
    # matching behavior) is deterministic regardless of set iteration order.
    ordered = sorted(prefixes, key=lambda p: (-len(p), p))
    # re.escape guards against regex metacharacters inside a prefix; each
    # internal space is generalized to \s so any single whitespace character
    # may separate the words of a multi-word prefix.
    pattern = "|".join(r"\s".join(re.escape(word) for word in prefix.split()) for prefix in ordered)
    return re.compile(rf"({pattern})\s", re.IGNORECASE)


DEFAULT_BDD_PREFIX_REGEXP = build_bdd_prefix_regexp(frozenset(DEFAULT_BDD_PREFIXES))
49+
50+
4151
class KeywordFinder:
4252
def __init__(
4353
self,
@@ -501,14 +511,9 @@ def _create_custom_and_standard_keyword_conflict_warning_message(
501511

502512
@functools.cached_property
def bdd_prefix_regexp(self) -> "re.Pattern[str]":
    """Regex matching any configured BDD prefix plus its separator.

    Uses the namespace's language configuration when present; otherwise
    falls back to the precompiled default English prefixes.
    """
    languages = self._languages
    if languages is None:
        return DEFAULT_BDD_PREFIX_REGEXP
    return build_bdd_prefix_regexp(frozenset(languages.bdd_prefixes))
512517

513518
def _get_bdd_style_keyword(self, name: str) -> Optional[KeywordDoc]:
514519
match = self.bdd_prefix_regexp.match(name)

packages/robot/src/robotcode/robot/diagnostics/model_helper.py

Lines changed: 38 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
VariableDefinition,
4040
VariableNotFoundDefinition,
4141
)
42-
from .keyword_finder import DEFAULT_BDD_PREFIXES
42+
from .keyword_finder import DEFAULT_BDD_PREFIX_REGEXP, build_bdd_prefix_regexp
4343
from .library_doc import (
4444
ArgumentInfo,
4545
KeywordArgumentKind,
@@ -562,38 +562,35 @@ def get_expression_statement_types(cls) -> Tuple[Type[Any], ...]:
562562
BDD_TOKEN_REGEX = re.compile(r"^(Given|When|Then|And|But)\s", flags=re.IGNORECASE)
563563
BDD_TOKEN = re.compile(r"^(Given|When|Then|And|But)$", flags=re.IGNORECASE)
564564

565+
@classmethod
def _get_bdd_prefix_regexp(cls, namespace: "Namespace") -> "re.Pattern[str]":
    """Return the BDD-prefix regex for *namespace*'s language configuration.

    Falls back to the default English prefixes when no languages are configured.
    """
    languages = namespace.languages
    if languages is None:
        return DEFAULT_BDD_PREFIX_REGEXP
    return build_bdd_prefix_regexp(frozenset(languages.bdd_prefixes))
570+
565571
@classmethod
def split_bdd_prefix(cls, namespace: "Namespace", token: Token) -> Tuple[Optional[Token], Optional[Token]]:
    """Split *token* into its BDD prefix and the remaining keyword name.

    Returns a ``(prefix_token, rest_token)`` pair. When no configured BDD
    prefix matches the start of the token value, returns ``(None, token)``
    with the original token unchanged.
    """
    match = cls._get_bdd_prefix_regexp(namespace).match(token.value)
    if match is None:
        return None, token

    prefix_len = len(match.group(1))
    prefix_token = Token(
        token.type,
        token.value[:prefix_len],
        token.lineno,
        token.col_offset,
        token.error,
    )
    # Remainder starts after the prefix and the single separator character.
    rest_token = Token(
        token.type,
        token.value[prefix_len + 1 :],
        token.lineno,
        token.col_offset + prefix_len + 1,
        token.error,
    )
    return prefix_token, rest_token
597594

598595
@classmethod
599596
def strip_bdd_prefix(cls, namespace: "Namespace", token: Token) -> Token:
@@ -611,24 +608,16 @@ def strip_bdd_prefix(cls, namespace: "Namespace", token: Token) -> Token:
611608
)
612609
return token
613610

614-
parts = token.value.split()
615-
if len(parts) < 2:
616-
return token
617-
618-
for index in range(1, len(parts)):
619-
prefix = " ".join(parts[:index]).title()
620-
if prefix in (
621-
namespace.languages.bdd_prefixes if namespace.languages is not None else DEFAULT_BDD_PREFIXES
622-
):
623-
bdd_len = len(prefix)
624-
token = Token(
625-
token.type,
626-
token.value[bdd_len + 1 :],
627-
token.lineno,
628-
token.col_offset + bdd_len + 1,
629-
token.error,
630-
)
631-
break
611+
bdd_match = cls._get_bdd_prefix_regexp(namespace).match(token.value)
612+
if bdd_match:
613+
bdd_len = len(bdd_match.group(1))
614+
token = Token(
615+
token.type,
616+
token.value[bdd_len + 1 :],
617+
token.lineno,
618+
token.col_offset + bdd_len + 1,
619+
token.error,
620+
)
632621

633622
return token
634623

@@ -638,17 +627,8 @@ def is_bdd_token(cls, namespace: "Namespace", token: Token) -> bool:
638627
bdd_match = cls.BDD_TOKEN.match(token.value)
639628
return bool(bdd_match)
640629

641-
parts = token.value.split()
642-
643-
for index in range(len(parts)):
644-
prefix = " ".join(parts[: index + 1]).title()
645-
646-
if prefix.title() in (
647-
namespace.languages.bdd_prefixes if namespace.languages is not None else DEFAULT_BDD_PREFIXES
648-
):
649-
return True
650-
651-
return False
630+
bdd_match = cls._get_bdd_prefix_regexp(namespace).match(token.value + " ")
631+
return bdd_match is not None and len(bdd_match.group(1)) == len(token.value)
652632

653633
@classmethod
654634
def get_keyword_definition_at_token(cls, library_doc: LibraryDoc, token: Token) -> Optional[KeywordDoc]:

tests/robotcode/language_server/robotframework/parts/test_semantic_tokens_unit.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,5 +1842,142 @@ def test_complex_library_imports_and_usage(self) -> None:
18421842
assert successful_tokens >= total_tokens * 0.7 # 70% success rate minimum
18431843

18441844

1845+
class TestBddPrefixSemanticTokens:
    """Verify generate_sem_sub_tokens emits BDD_PREFIX tokens for multi-word and French prefixes."""

    # French BDD prefixes as defined by Robot Framework's language support.
    FRENCH_BDD_PREFIXES = {
        "Étant Donné",
        "Étant Donné Que",
        "Étant Donné Qu'",
        "Soit",
        "Sachant Que",
        "Sachant Qu'",
        "Sachant",
        "Etant Donné",
        "Etant Donné Que",
        "Etant Donné Qu'",
        "Etant Donnée",
        "Etant Données",
        "Lorsque",
        "Quand",
        "Lorsqu'",
        "Alors",
        "Donc",
        "Et",
        "Et Que",
        "Et Qu'",
        "Mais",
        "Mais Que",
        "Mais Qu'",
    }

    @staticmethod
    def _make_namespace(bdd_prefixes: Any = None) -> Any:
        """Build a minimal namespace stub; attach a languages stub when prefixes are given."""
        attrs = {
            "find_keyword": lambda self, name, **kwargs: None,
            "languages": None,
            "namespaces": {},
        }
        namespace = type("MockNamespace", (), attrs)()
        if bdd_prefixes is not None:
            namespace.languages = type("MockLanguages", (), {"bdd_prefixes": bdd_prefixes})()
        return namespace

    def setup_method(self) -> None:
        self.generator = SemanticTokenGenerator()

    def _get_bdd_tokens(self, token_value: str, bdd_prefixes: Any = None) -> list[SemTokenInfo]:
        """Run semantic-token generation for a single KEYWORD token at column 4."""
        keyword_token = Token(Token.KEYWORD, token_value, 1, 4)
        call_node = KeywordCall([])
        namespace = self._make_namespace(bdd_prefixes)
        return list(self.generator.generate_sem_sub_tokens(namespace, None, keyword_token, call_node))

    def _assert_bdd_prefix_token(self, tokens: list[SemTokenInfo], expected_prefix: str, start_col: int = 4) -> None:
        """Assert exactly one BDD_PREFIX token exists with the expected position and length."""
        prefix_tokens = [t for t in tokens if t.sem_token_type == RobotSemTokenTypes.BDD_PREFIX]
        assert len(prefix_tokens) == 1, (
            f"Expected exactly 1 BDD_PREFIX token for '{expected_prefix}', "
            f"got {len(prefix_tokens)}: {[(t.col_offset, t.length) for t in prefix_tokens]}"
        )
        prefix_token = prefix_tokens[0]
        assert prefix_token.col_offset == start_col
        assert prefix_token.length == len(expected_prefix), (
            f"Expected BDD prefix length {len(expected_prefix)} for '{expected_prefix}', got {prefix_token.length}"
        )

    @pytest.mark.parametrize(
        ("text", "expected_prefix"),
        [
            ("Given something", "Given"),
            ("When something", "When"),
            ("Then something", "Then"),
            ("And something", "And"),
            ("But something", "But"),
        ],
    )
    def test_english_bdd_prefix_tokens(self, text: str, expected_prefix: str) -> None:
        self._assert_bdd_prefix_token(self._get_bdd_tokens(text), expected_prefix)

    @pytest.mark.skipif(RF_VERSION < (6, 0), reason="Language support requires RF >= 6.0")
    @pytest.mark.parametrize(
        ("text", "expected_prefix"),
        [
            ("Et que My Keyword", "Et que"),
            ("Et My Keyword", "Et"),
            ("Étant donné que My Keyword", "Étant donné que"),
            ("Étant donné My Keyword", "Étant donné"),
            ("Mais que My Keyword", "Mais que"),
            ("Mais My Keyword", "Mais"),
            ("Sachant que My Keyword", "Sachant que"),
            ("Sachant My Keyword", "Sachant"),
            ("Lorsque My Keyword", "Lorsque"),
            ("Alors My Keyword", "Alors"),
            ("Donc My Keyword", "Donc"),
            ("Etant donné que My Keyword", "Etant donné que"),
        ],
    )
    def test_french_bdd_prefix_tokens(self, text: str, expected_prefix: str) -> None:
        self._assert_bdd_prefix_token(self._get_bdd_tokens(text, self.FRENCH_BDD_PREFIXES), expected_prefix)

    @pytest.mark.skipif(RF_VERSION < (6, 0), reason="Language support requires RF >= 6.0")
    def test_french_no_bdd_prefix(self) -> None:
        tokens = self._get_bdd_tokens("My Keyword", self.FRENCH_BDD_PREFIXES)
        assert not [t for t in tokens if t.sem_token_type == RobotSemTokenTypes.BDD_PREFIX]

    @pytest.mark.skipif(RF_VERSION < (6, 0), reason="Language support requires RF >= 6.0")
    def test_french_longest_prefix_wins_et_que_vs_et(self) -> None:
        """Regression: 'Et que' must match as full prefix, not just 'Et'."""
        self._assert_bdd_prefix_token(self._get_bdd_tokens("Et que My Keyword", self.FRENCH_BDD_PREFIXES), "Et que")

    @pytest.mark.skipif(RF_VERSION < (6, 0), reason="Language support requires RF >= 6.0")
    def test_french_longest_prefix_wins_mais_que_vs_mais(self) -> None:
        """Regression: 'Mais que' must match as full prefix, not just 'Mais'."""
        self._assert_bdd_prefix_token(
            self._get_bdd_tokens("Mais que My Keyword", self.FRENCH_BDD_PREFIXES), "Mais que"
        )

    @pytest.mark.skipif(RF_VERSION < (6, 0), reason="Language support requires RF >= 6.0")
    def test_french_longest_prefix_wins_etant_donne_que_vs_etant_donne(self) -> None:
        """Regression: 'Étant donné que' must match as full prefix, not just 'Étant donné'."""
        self._assert_bdd_prefix_token(
            self._get_bdd_tokens("Étant donné que My Keyword", self.FRENCH_BDD_PREFIXES), "Étant donné que"
        )

    @pytest.mark.skipif(RF_VERSION < (6, 0), reason="Language support requires RF >= 6.0")
    def test_separator_token_after_bdd_prefix(self) -> None:
        """The space after the BDD prefix must also be emitted as a token."""
        tokens = self._get_bdd_tokens("Et que My Keyword", self.FRENCH_BDD_PREFIXES)
        prefix_tokens = [t for t in tokens if t.sem_token_type == RobotSemTokenTypes.BDD_PREFIX]
        assert len(prefix_tokens) == 1
        prefix = prefix_tokens[0]
        # The separator is the single space between prefix and keyword (length == 1).
        separators = [t for t in tokens if t.col_offset == prefix.col_offset + prefix.length and t.length == 1]
        assert len(separators) == 1, "Expected a separator token (space) after BDD prefix"
1980+
1981+
18451982
if __name__ == "__main__":
18461983
pytest.main([__file__])

0 commit comments

Comments
 (0)