feat: support raw data

amaanq · amaanq · commit 9bb9bf0ffbf4 · 2023-11-19T14:06:16.000-05:00
diff --git a/grammar.js b/grammar.js
@@ -25,6 +25,7 @@ module.exports = grammar(
 
         externals: $ => [
             $.code_identifier,
+            $.raw_data,
             $._whitespace_no_newline,
             $._error_sentinel,
         ],
@@ -72,6 +73,7 @@ module.exports = grammar(
                 $.machine_code_bytes,
                 choice(
                     $._whitespace_no_newline,
+                    $.raw_data,
                     seq(
                         /\s*/,
                         choice(
diff --git a/src/scanner.c b/src/scanner.c
@@ -16,15 +16,21 @@
 
 #include "tree_sitter/parser.h"
 
+#include <ctype.h>
 #include <stdio.h>
 #include <wctype.h>
 
 enum TokenType {
     CODE_IDENTIFIER,
+    RAW_DATA, // position must mirror `$.raw_data` in the `externals` list of grammar.js
     WHITESPACE_NO_NEWLINE,
     ERROR_SENTINEL,
 };
 
+static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } // consumed char stays in the token
+
+static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } // consumed char is left out of the token
+
 static bool is_hexadecimal_character(char character) {
     switch (character) {
         case '0':
@@ -67,7 +73,7 @@ static bool scan_code_identifier(TSLexer *lexer) {
     unsigned int const size = (sizeof(next_token_text) / sizeof(char) - 1);
 
     while (true) {
-        lexer->advance(lexer, false);
+        advance(lexer);
 
         if (lexer->lookahead == '\n' || lexer->eof(lexer)) {
             lexer->result_symbol = CODE_IDENTIFIER;
@@ -150,6 +156,7 @@ static bool scan_whitespace_no_newline(TSLexer *lexer) {
     // mark_end was called, we have control over the size of a matched token.
     //
     lexer->mark_end(lexer);
+    lexer->result_symbol = WHITESPACE_NO_NEWLINE;
 
     bool has_text = false;
 
@@ -164,15 +171,14 @@ static bool scan_whitespace_no_newline(TSLexer *lexer) {
             case ' ':
             case '\t':
                 has_text = true;
-                lexer->result_symbol = WHITESPACE_NO_NEWLINE;
                 lexer->mark_end(lexer);
 
                 break;
             default:
                 return false;
         };
 
-        lexer->advance(lexer, false);
+        advance(lexer);
     }
 }
 
@@ -187,7 +193,60 @@ bool tree_sitter_objdump_external_scanner_scan(void *payload, TSLexer *lexer, co
         return false;
     }
 
-    if (valid_symbols[WHITESPACE_NO_NEWLINE]) {
+    bool advanced_once = false;
+
+    if (valid_symbols[RAW_DATA]) {
+        while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
+            skip(lexer);
+        }
+
+        bool found_dot = false;
+        uint32_t raw_data_count = 0; // 32 bits so a long line cannot wrap the counter back below the thresholds
+        // Consume up to (never past) the newline. Stop at end of input too, because lookahead never becomes '\n'
+        // there. Require a dot within the first 4 characters and reject anything that looks like 0x.
+        while (lexer->lookahead != '\n' && !lexer->eof(lexer)) {
+            if (lexer->lookahead == '.' && raw_data_count < 4) {
+                found_dot = true;
+            }
+            if (lexer->lookahead == '0') {
+                advance(lexer);
+                advanced_once = true;
+                raw_data_count++;
+                if (lexer->lookahead == 'x') {
+                    return false;
+                }
+            }
+            // disallow two alphanumerics in a row; iswalnum because lookahead is a wide code point, not a char
+            if (iswalnum(lexer->lookahead)) {
+                advance(lexer);
+                advanced_once = true;
+                raw_data_count++;
+                if (iswalnum(lexer->lookahead)) {
+                    return false;
+                }
+            }
+            // The branches above may have stopped right at the newline; it must not be swallowed into the token.
+            if (lexer->lookahead == '\n') {
+                break;
+            }
+            advance(lexer);
+            if (!iswspace(lexer->lookahead)) {
+                advanced_once = true;
+            }
+            raw_data_count++;
+        }
+
+        if (lexer->lookahead == '\n' && found_dot && raw_data_count > 1) {
+            lexer->result_symbol = RAW_DATA;
+            return true;
+        }
+
+        if (raw_data_count > 4 && !found_dot) {
+            return false;
+        }
+    }
+
+    if (valid_symbols[WHITESPACE_NO_NEWLINE] && !advanced_once) {
         return scan_whitespace_no_newline(lexer);
     }
 
diff --git a/test/corpus/sections.txt b/test/corpus/sections.txt
@@ -424,3 +424,87 @@ Disassembly of section __libc_freeres_fn:
         (byte)
         (byte))
       (instruction))))
+
+================================================================================
+Disassembly Section With Raw Data
+================================================================================
+
+Disassembly of section .data:
+
+20000000 <SystemCoreClock>:
+20000000:	00f42400                                .$..
+
+20000004 <uwTickPrio>:
+20000004:	00000010                                ....
+
+20000008 <uwTickFreq>:
+20000008:	00000001                                ....
+
+2000000c <__sglue>:
+2000000c:	00000000 00000003 20000200              ........... 
+
+20000018 <_impure_data>:
+20000018:	00000000 20000200 20000268 200002d0     ....... h.. ... 
+	...
+
+20000064 <_impure_ptr>:
+20000064:	20000018                                ...
+
+---
+
+(source
+  (disassembly_section_label
+    (identifier))
+  (disassembly_section
+    (section_address)
+    (identifier)
+    (memory_offset
+      (address)
+      (machine_code_bytes
+        (byte))
+      (raw_data)))
+  (disassembly_section
+    (section_address)
+    (identifier)
+    (memory_offset
+      (address)
+      (machine_code_bytes
+        (byte))
+      (raw_data)))
+  (disassembly_section
+    (section_address)
+    (identifier)
+    (memory_offset
+      (address)
+      (machine_code_bytes
+        (byte))
+      (raw_data)))
+  (disassembly_section
+    (section_address)
+    (identifier)
+    (memory_offset
+      (address)
+      (machine_code_bytes
+        (byte)
+        (byte)
+        (byte))
+      (raw_data)))
+  (disassembly_section
+    (section_address)
+    (identifier)
+    (memory_offset
+      (address)
+      (machine_code_bytes
+        (byte)
+        (byte)
+        (byte)
+        (byte))
+      (raw_data)))
+  (disassembly_section
+    (section_address)
+    (identifier)
+    (memory_offset
+      (address)
+      (machine_code_bytes
+        (byte))
+      (raw_data))))