Skip to content

Commit 7cefede

Browse files
facontidavide and claude committed
Add hand-written tokenizer for scripting language
Implement BT::Scripting::tokenize() to replace the lexy-based tokenizer that was removed. Handles all token types: identifiers (with @ prefix for root blackboard vars), numbers (int/hex/real/exponent), single-quoted strings, two-char operators, and single-char operators/delimiters. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 4bbec24 commit 7cefede

1 file changed

Lines changed: 360 additions & 0 deletions

File tree

src/script_tokenizer.cpp

Lines changed: 360 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,360 @@
1+
/* Copyright (C) 2022-2025 Davide Faconti - All Rights Reserved
2+
*
3+
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
4+
* to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
5+
* and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6+
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7+
*
8+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
9+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
10+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11+
*/
12+
13+
#include "behaviortree_cpp/scripting/any_types.hpp"
14+
15+
#include <cctype>
16+
17+
namespace BT::Scripting
18+
{
19+
20+
namespace
21+
{
22+
23+
/// Returns true when `c` may begin an identifier: an ASCII letter, '_' or '@'
/// ('@' is the prefix used for root blackboard variables).
///
/// The letter test uses explicit ASCII ranges rather than std::isalpha so the
/// result never depends on the C locale installed at runtime.
bool isIdentStart(char c)
{
  const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  return is_letter || c == '_' || c == '@';
}
27+
28+
/// Returns true when `c` may appear after the first character of an identifier:
/// an ASCII letter, an ASCII decimal digit or '_'. Note that '@' is NOT accepted
/// here; it is only valid as the leading character.
///
/// Explicit ASCII ranges are used instead of std::isalnum so the result never
/// depends on the C locale installed at runtime.
bool isIdentChar(char c)
{
  const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  const bool is_digit = (c >= '0' && c <= '9');
  return is_letter || is_digit || c == '_';
}
32+
33+
/// Returns true when `c` is an ASCII decimal digit ('0'..'9').
///
/// Written as a plain range check so it yields a real bool and behaves the same
/// on every C runtime, regardless of the installed locale.
bool isDigit(char c)
{
  return c >= '0' && c <= '9';
}
37+
38+
/// Returns true when `c` is an ASCII hexadecimal digit: '0'..'9', 'a'..'f' or
/// 'A'..'F'.
///
/// Written as plain range checks so it yields a real bool and behaves the same
/// on every C runtime, regardless of the installed locale.
bool isHexDigit(char c)
{
  const bool is_decimal = (c >= '0' && c <= '9');
  const bool is_lower_hex = (c >= 'a' && c <= 'f');
  const bool is_upper_hex = (c >= 'A' && c <= 'F');
  return is_decimal || is_lower_hex || is_upper_hex;
}
42+
43+
} // namespace
44+
45+
std::vector<Token> tokenize(const std::string& source)
46+
{
47+
std::vector<Token> tokens;
48+
const size_t len = source.size();
49+
size_t i = 0;
50+
51+
while(i < len)
52+
{
53+
const char c = source[i];
54+
55+
// Skip whitespace (space, tab, newline, carriage return)
56+
if(c == ' ' || c == '\t' || c == '\n' || c == '\r')
57+
{
58+
++i;
59+
continue;
60+
}
61+
62+
const size_t start = i;
63+
64+
// Single-quoted string literal
65+
if(c == '\'')
66+
{
67+
++i;
68+
while(i < len && source[i] != '\'')
69+
{
70+
++i;
71+
}
72+
if(i < len)
73+
{
74+
// extract content without quotes
75+
std::string_view text(&source[start + 1], i - start - 1);
76+
tokens.push_back({ TokenType::String, text, start });
77+
++i; // skip closing quote
78+
}
79+
else
80+
{
81+
std::string_view text(&source[start], i - start);
82+
tokens.push_back({ TokenType::Error, text, start });
83+
}
84+
continue;
85+
}
86+
87+
// Number literal (integer or real)
88+
if(isDigit(c))
89+
{
90+
bool is_real = false;
91+
bool has_error = false;
92+
93+
// Check for hex prefix
94+
if(c == '0' && i + 1 < len && (source[i + 1] == 'x' || source[i + 1] == 'X'))
95+
{
96+
i += 2; // skip "0x"/"0X"
97+
if(i >= len || !isHexDigit(source[i]))
98+
{
99+
has_error = true;
100+
}
101+
else
102+
{
103+
while(i < len && isHexDigit(source[i]))
104+
{
105+
++i;
106+
}
107+
}
108+
// Hex numbers don't support dot or exponent
109+
if(i < len && (source[i] == '.' || isIdentChar(source[i])))
110+
{
111+
has_error = true;
112+
while(i < len && (isIdentChar(source[i]) || source[i] == '.'))
113+
{
114+
++i;
115+
}
116+
}
117+
}
118+
else
119+
{
120+
// Decimal integer part
121+
while(i < len && isDigit(source[i]))
122+
{
123+
++i;
124+
}
125+
// Check for fractional part
126+
if(i < len && source[i] == '.')
127+
{
128+
// Distinguish from ".." (concat operator)
129+
if(i + 1 < len && source[i + 1] == '.')
130+
{
131+
// Stop here: "65.." is Integer("65") + DotDot
132+
}
133+
else if(i + 1 < len && isDigit(source[i + 1]))
134+
{
135+
is_real = true;
136+
++i; // consume '.'
137+
while(i < len && isDigit(source[i]))
138+
{
139+
++i;
140+
}
141+
}
142+
else
143+
{
144+
// "65." or "65.x" -- incomplete real
145+
has_error = true;
146+
++i; // consume the dot
147+
while(i < len && (isIdentChar(source[i]) || source[i] == '.'))
148+
{
149+
++i;
150+
}
151+
}
152+
}
153+
// Check for exponent (only for decimal numbers)
154+
if(!has_error && i < len && (source[i] == 'e' || source[i] == 'E'))
155+
{
156+
is_real = true;
157+
++i; // consume 'e'/'E'
158+
if(i < len && (source[i] == '+' || source[i] == '-'))
159+
{
160+
++i; // consume sign
161+
}
162+
if(i >= len || !isDigit(source[i]))
163+
{
164+
has_error = true;
165+
}
166+
else
167+
{
168+
while(i < len && isDigit(source[i]))
169+
{
170+
++i;
171+
}
172+
}
173+
}
174+
// Check for trailing alpha (e.g. "3foo", "65.43foo")
175+
if(!has_error && i < len && isIdentStart(source[i]))
176+
{
177+
has_error = true;
178+
while(i < len && isIdentChar(source[i]))
179+
{
180+
++i;
181+
}
182+
}
183+
}
184+
185+
std::string_view text(&source[start], i - start);
186+
if(has_error)
187+
{
188+
tokens.push_back({ TokenType::Error, text, start });
189+
}
190+
else if(is_real)
191+
{
192+
tokens.push_back({ TokenType::Real, text, start });
193+
}
194+
else
195+
{
196+
tokens.push_back({ TokenType::Integer, text, start });
197+
}
198+
continue;
199+
}
200+
201+
// Identifier or keyword (true/false)
202+
if(isIdentStart(c))
203+
{
204+
++i; // consume start character (may not be isIdentChar, e.g. '@')
205+
while(i < len && isIdentChar(source[i]))
206+
{
207+
++i;
208+
}
209+
std::string_view text(&source[start], i - start);
210+
if(text == "true" || text == "false")
211+
{
212+
tokens.push_back({ TokenType::Boolean, text, start });
213+
}
214+
else
215+
{
216+
tokens.push_back({ TokenType::Identifier, text, start });
217+
}
218+
continue;
219+
}
220+
221+
// Two-character operators (check before single-char)
222+
if(i + 1 < len)
223+
{
224+
const char next = source[i + 1];
225+
TokenType two_char_type = TokenType::Error;
226+
bool matched = true;
227+
228+
if(c == '.' && next == '.')
229+
{
230+
two_char_type = TokenType::DotDot;
231+
}
232+
else if(c == '&' && next == '&')
233+
{
234+
two_char_type = TokenType::AmpAmp;
235+
}
236+
else if(c == '|' && next == '|')
237+
{
238+
two_char_type = TokenType::PipePipe;
239+
}
240+
else if(c == '=' && next == '=')
241+
{
242+
two_char_type = TokenType::EqualEqual;
243+
}
244+
else if(c == '!' && next == '=')
245+
{
246+
two_char_type = TokenType::BangEqual;
247+
}
248+
else if(c == '<' && next == '=')
249+
{
250+
two_char_type = TokenType::LessEqual;
251+
}
252+
else if(c == '>' && next == '=')
253+
{
254+
two_char_type = TokenType::GreaterEqual;
255+
}
256+
else if(c == ':' && next == '=')
257+
{
258+
two_char_type = TokenType::ColonEqual;
259+
}
260+
else if(c == '+' && next == '=')
261+
{
262+
two_char_type = TokenType::PlusEqual;
263+
}
264+
else if(c == '-' && next == '=')
265+
{
266+
two_char_type = TokenType::MinusEqual;
267+
}
268+
else if(c == '*' && next == '=')
269+
{
270+
two_char_type = TokenType::StarEqual;
271+
}
272+
else if(c == '/' && next == '=')
273+
{
274+
two_char_type = TokenType::SlashEqual;
275+
}
276+
else
277+
{
278+
matched = false;
279+
}
280+
281+
if(matched)
282+
{
283+
std::string_view text(&source[start], 2);
284+
tokens.push_back({ two_char_type, text, start });
285+
i += 2;
286+
continue;
287+
}
288+
}
289+
290+
// Single-character operators and delimiters
291+
{
292+
TokenType type = TokenType::Error;
293+
switch(c)
294+
{
295+
case '+':
296+
type = TokenType::Plus;
297+
break;
298+
case '-':
299+
type = TokenType::Minus;
300+
break;
301+
case '*':
302+
type = TokenType::Star;
303+
break;
304+
case '/':
305+
type = TokenType::Slash;
306+
break;
307+
case '&':
308+
type = TokenType::Ampersand;
309+
break;
310+
case '|':
311+
type = TokenType::Pipe;
312+
break;
313+
case '^':
314+
type = TokenType::Caret;
315+
break;
316+
case '~':
317+
type = TokenType::Tilde;
318+
break;
319+
case '!':
320+
type = TokenType::Bang;
321+
break;
322+
case '<':
323+
type = TokenType::Less;
324+
break;
325+
case '>':
326+
type = TokenType::Greater;
327+
break;
328+
case '=':
329+
type = TokenType::Equal;
330+
break;
331+
case '?':
332+
type = TokenType::Question;
333+
break;
334+
case ':':
335+
type = TokenType::Colon;
336+
break;
337+
case '(':
338+
type = TokenType::LeftParen;
339+
break;
340+
case ')':
341+
type = TokenType::RightParen;
342+
break;
343+
case ';':
344+
type = TokenType::Semicolon;
345+
break;
346+
default:
347+
break;
348+
}
349+
std::string_view text(&source[start], 1);
350+
tokens.push_back({ type, text, start });
351+
++i;
352+
}
353+
}
354+
355+
// Sentinel
356+
tokens.push_back({ TokenType::EndOfInput, {}, i });
357+
return tokens;
358+
}
359+
360+
} // namespace BT::Scripting

0 commit comments

Comments
 (0)