1616
1717#include "tree_sitter/parser.h"
1818
19+ #include <ctype.h>
1920#include <stdio.h>
2021#include <wctype.h>
2122
/*
 * Token kinds handled by this external scanner.
 *
 * The enumerators are used both as indices into the `valid_symbols` array and
 * as values assigned to `lexer->result_symbol`.
 *
 * NOTE(review): the order presumably has to match the `externals` list of the
 * grammar definition -- confirm against the grammar before reordering.
 */
enum TokenType {
    CODE_IDENTIFIER,
    RAW_DATA,
    WHITESPACE_NO_NEWLINE,
    ERROR_SENTINEL,
};
2729
30+ static inline void advance (TSLexer * lexer ) { lexer -> advance (lexer , false); }
31+
32+ static inline void skip (TSLexer * lexer ) { lexer -> advance (lexer , true); }
33+
2834static bool is_hexadecimal_character (char character ) {
2935 switch (character ) {
3036 case '0' :
@@ -67,7 +73,7 @@ static bool scan_code_identifier(TSLexer *lexer) {
6773 unsigned int const size = (sizeof (next_token_text ) / sizeof (char ) - 1 );
6874
6975 while (true) {
70- lexer -> advance (lexer , false );
76+ advance (lexer );
7177
7278 if (lexer -> lookahead == '\n' || lexer -> eof (lexer )) {
7379 lexer -> result_symbol = CODE_IDENTIFIER ;
@@ -150,6 +156,7 @@ static bool scan_whitespace_no_newline(TSLexer *lexer) {
150156 // mark_end was called, we have control over the size of a matched token.
151157 //
152158 lexer -> mark_end (lexer );
159+ lexer -> result_symbol = WHITESPACE_NO_NEWLINE ;
153160
154161 bool has_text = false;
155162
@@ -164,15 +171,14 @@ static bool scan_whitespace_no_newline(TSLexer *lexer) {
164171 case ' ' :
165172 case '\t' :
166173 has_text = true;
167- lexer -> result_symbol = WHITESPACE_NO_NEWLINE ;
168174 lexer -> mark_end (lexer );
169175
170176 break ;
171177 default :
172178 return false;
173179 };
174180
175- lexer -> advance (lexer , false );
181+ advance (lexer );
176182 }
177183}
178184
@@ -187,7 +193,60 @@ bool tree_sitter_objdump_external_scanner_scan(void *payload, TSLexer *lexer, co
187193 return false;
188194 }
189195
190- if (valid_symbols [WHITESPACE_NO_NEWLINE ]) {
196+ bool advanced_once = false;
197+
198+ if (valid_symbols [RAW_DATA ]) {
199+ while (iswspace (lexer -> lookahead ) && lexer -> lookahead != '\n' ) {
200+ skip (lexer );
201+ }
202+
203+ bool found_dot = false;
204+ uint8_t raw_data_count = 0 ;
205+
206+ // consume till newline, require at least one dot and require it to occur within the first 4 characters, and
207+ // don't parse any 0x's.
208+ while (lexer -> lookahead != '\n' ) {
209+ if (lexer -> lookahead == '.' && raw_data_count < 4 ) {
210+ found_dot = true;
211+ }
212+
213+ if (lexer -> lookahead == '0' ) {
214+ advance (lexer );
215+ advanced_once = true;
216+ raw_data_count ++ ;
217+ if (lexer -> lookahead == 'x' ) {
218+ return false;
219+ }
220+ }
221+
222+ // disallow two alphanumerics in a row
223+ if (isalnum (lexer -> lookahead )) {
224+ advance (lexer );
225+ advanced_once = true;
226+ raw_data_count ++ ;
227+ if (isalnum (lexer -> lookahead )) {
228+ return false;
229+ }
230+ }
231+
232+ advance (lexer );
233+ if (!iswspace (lexer -> lookahead )) {
234+ advanced_once = true;
235+ }
236+ raw_data_count ++ ;
237+ }
238+
239+ if (lexer -> lookahead == '\n' && found_dot && raw_data_count > 1 ) {
240+ lexer -> result_symbol = RAW_DATA ;
241+ return true;
242+ }
243+
244+ if (raw_data_count > 4 && !found_dot ) {
245+ return false;
246+ }
247+ }
248+
249+ if (valid_symbols [WHITESPACE_NO_NEWLINE ] && !advanced_once ) {
191250 return scan_whitespace_no_newline (lexer );
192251 }
193252
0 commit comments