Skip to content

Commit 9bb9bf0

Browse files
committed
feat: support raw data
1 parent 0b612cd commit 9bb9bf0

3 files changed

Lines changed: 149 additions & 4 deletions

File tree

grammar.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ module.exports = grammar(
2525

2626
externals: $ => [
2727
$.code_identifier,
28+
$.raw_data,
2829
$._whitespace_no_newline,
2930
$._error_sentinel,
3031
],
@@ -72,6 +73,7 @@ module.exports = grammar(
7273
$.machine_code_bytes,
7374
choice(
7475
$._whitespace_no_newline,
76+
$.raw_data,
7577
seq(
7678
/\s*/,
7779
choice(

src/scanner.c

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,21 @@
1616

1717
#include "tree_sitter/parser.h"
1818

19+
#include <ctype.h>
1920
#include <stdio.h>
2021
#include <wctype.h>
2122

2223
enum TokenType {
2324
CODE_IDENTIFIER,
25+
RAW_DATA,
2426
WHITESPACE_NO_NEWLINE,
2527
ERROR_SENTINEL,
2628
};
2729

30+
// Move the lexer forward by one character.
//
// Thin wrapper around the lexer's own `advance` callback, always passing
// `false` as the second ("skip") argument.
static inline void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
}
31+
32+
// Move the lexer forward by one character in "skip" mode.
//
// Thin wrapper around the lexer's own `advance` callback, always passing
// `true` as the second ("skip") argument.
static inline void skip(TSLexer *lexer) {
    lexer->advance(lexer, true);
}
33+
2834
static bool is_hexadecimal_character(char character) {
2935
switch (character) {
3036
case '0':
@@ -67,7 +73,7 @@ static bool scan_code_identifier(TSLexer *lexer) {
6773
unsigned int const size = (sizeof(next_token_text) / sizeof(char) - 1);
6874

6975
while (true) {
70-
lexer->advance(lexer, false);
76+
advance(lexer);
7177

7278
if (lexer->lookahead == '\n' || lexer->eof(lexer)) {
7379
lexer->result_symbol = CODE_IDENTIFIER;
@@ -150,6 +156,7 @@ static bool scan_whitespace_no_newline(TSLexer *lexer) {
150156
// mark_end was called, we have control over the size of a matched token.
151157
//
152158
lexer->mark_end(lexer);
159+
lexer->result_symbol = WHITESPACE_NO_NEWLINE;
153160

154161
bool has_text = false;
155162

@@ -164,15 +171,14 @@ static bool scan_whitespace_no_newline(TSLexer *lexer) {
164171
case ' ':
165172
case '\t':
166173
has_text = true;
167-
lexer->result_symbol = WHITESPACE_NO_NEWLINE;
168174
lexer->mark_end(lexer);
169175

170176
break;
171177
default:
172178
return false;
173179
};
174180

175-
lexer->advance(lexer, false);
181+
advance(lexer);
176182
}
177183
}
178184

@@ -187,7 +193,60 @@ bool tree_sitter_objdump_external_scanner_scan(void *payload, TSLexer *lexer, co
187193
return false;
188194
}
189195

190-
if (valid_symbols[WHITESPACE_NO_NEWLINE]) {
196+
bool advanced_once = false;
197+
198+
if (valid_symbols[RAW_DATA]) {
199+
while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
200+
skip(lexer);
201+
}
202+
203+
bool found_dot = false;
204+
uint8_t raw_data_count = 0;
205+
206+
// Consume up to the newline; require at least one dot, seen within the first 4 characters, and
207+
// don't parse any 0x's.
208+
while (lexer->lookahead != '\n') {
209+
if (lexer->lookahead == '.' && raw_data_count < 4) {
210+
found_dot = true;
211+
}
212+
213+
if (lexer->lookahead == '0') {
214+
advance(lexer);
215+
advanced_once = true;
216+
raw_data_count++;
217+
if (lexer->lookahead == 'x') {
218+
return false;
219+
}
220+
}
221+
222+
// disallow two alphanumerics in a row
223+
if (isalnum(lexer->lookahead)) {
224+
advance(lexer);
225+
advanced_once = true;
226+
raw_data_count++;
227+
if (isalnum(lexer->lookahead)) {
228+
return false;
229+
}
230+
}
231+
232+
advance(lexer);
233+
if (!iswspace(lexer->lookahead)) {
234+
advanced_once = true;
235+
}
236+
raw_data_count++;
237+
}
238+
239+
if (lexer->lookahead == '\n' && found_dot && raw_data_count > 1) {
240+
lexer->result_symbol = RAW_DATA;
241+
return true;
242+
}
243+
244+
if (raw_data_count > 4 && !found_dot) {
245+
return false;
246+
}
247+
}
248+
249+
if (valid_symbols[WHITESPACE_NO_NEWLINE] && !advanced_once) {
191250
return scan_whitespace_no_newline(lexer);
192251
}
193252

test/corpus/sections.txt

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,3 +424,87 @@ Disassembly of section __libc_freeres_fn:
424424
(byte)
425425
(byte))
426426
(instruction))))
427+
428+
================================================================================
429+
Disassembly Section With Raw Data
430+
================================================================================
431+
432+
Disassembly of section .data:
433+
434+
20000000 <SystemCoreClock>:
435+
20000000: 00f42400 .$..
436+
437+
20000004 <uwTickPrio>:
438+
20000004: 00000010 ....
439+
440+
20000008 <uwTickFreq>:
441+
20000008: 00000001 ....
442+
443+
2000000c <__sglue>:
444+
2000000c: 00000000 00000003 20000200 ...........
445+
446+
20000018 <_impure_data>:
447+
20000018: 00000000 20000200 20000268 200002d0 ....... h.. ...
448+
...
449+
450+
20000064 <_impure_ptr>:
451+
20000064: 20000018 ...
452+
453+
---
454+
455+
(source
456+
(disassembly_section_label
457+
(identifier))
458+
(disassembly_section
459+
(section_address)
460+
(identifier)
461+
(memory_offset
462+
(address)
463+
(machine_code_bytes
464+
(byte))
465+
(raw_data)))
466+
(disassembly_section
467+
(section_address)
468+
(identifier)
469+
(memory_offset
470+
(address)
471+
(machine_code_bytes
472+
(byte))
473+
(raw_data)))
474+
(disassembly_section
475+
(section_address)
476+
(identifier)
477+
(memory_offset
478+
(address)
479+
(machine_code_bytes
480+
(byte))
481+
(raw_data)))
482+
(disassembly_section
483+
(section_address)
484+
(identifier)
485+
(memory_offset
486+
(address)
487+
(machine_code_bytes
488+
(byte)
489+
(byte)
490+
(byte))
491+
(raw_data)))
492+
(disassembly_section
493+
(section_address)
494+
(identifier)
495+
(memory_offset
496+
(address)
497+
(machine_code_bytes
498+
(byte)
499+
(byte)
500+
(byte)
501+
(byte))
502+
(raw_data)))
503+
(disassembly_section
504+
(section_address)
505+
(identifier)
506+
(memory_offset
507+
(address)
508+
(machine_code_bytes
509+
(byte))
510+
(raw_data))))

0 commit comments

Comments
 (0)