From ec095b2849cf563402dbaf4af1a5689bfbdfc0bf Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Thu, 11 Jun 2026 14:45:04 +0200 Subject: [PATCH 1/9] add quality score script --- asmtransformers/scripts/quality_score.py | 59 ++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 asmtransformers/scripts/quality_score.py diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py new file mode 100644 index 0000000..20fe4da --- /dev/null +++ b/asmtransformers/scripts/quality_score.py @@ -0,0 +1,59 @@ +import sys +from pathlib import Path + +import datasets +import numpy as np + + +OUTPUT = Path('./results') + + +def map_tokens_to_df(dataset): + token = np.asarray(dataset['input_ids']).ravel() + token = token[token != 3920] # 3920 is padding token + bincount = np.bincount(token, minlength=3497) + + return { + 'len_cfg': len(token), + 'jtp_in_range': int(bincount[:512].sum()), + 'jtp_out_of_range': int(bincount[1600]), # Jump target exceeded token (from tokenizer) + 'jtp_unknown': int(bincount[3496]), # Jump Target Unknown token (from tokenizer) + } + + +def make_scorer(dataset): + + cfg_info = { + col: {'min': min(dataset[col]), 'max': max(dataset[col])} + for col in ['len_cfg', 'jtp_in_range', 'jtp_unknown', 'jtp_out_of_range'] + } + + def normalize(val, col): + min, max = cfg_info[col]['min'], cfg_info[col]['max'] + return (val - min) / (max - min) if max != min else 0 + + def add_score(example): + """ + Here weights can be added to features for their importance if this is deemed neccesairy. 
+ """ + example['score'] = ( + normalize(example['len_cfg'], 'len_cfg') + + normalize(example['jtp_in_range'], 'jtp_in_range') + - normalize(example['jtp_out_of_range'], 'jtp_out_of_range') + - normalize(example['jtp_unknown'], 'jtp_unknown') + ) + + return example + + return add_score + + +if __name__ == '__main__': + # With mktokenizer and tokenize_dataset.py + tokenized_dataset = datasets.load_from_disk(sys.argv[1]) + + # Need two passes of `map`` because to normalize we have to know min and max of values + tokenized_dataset = tokenized_dataset.map(map_tokens_to_df) + # In this pass we determine quality_score + scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset)) + scored_dataset.save_to_disk(OUTPUT) From 53c7eedf6f914665796853b51ac8f61ffed745b3 Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Thu, 11 Jun 2026 14:47:57 +0200 Subject: [PATCH 2/9] format script --- asmtransformers/scripts/quality_score.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 20fe4da..21a84f4 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -50,10 +50,10 @@ def add_score(example): if __name__ == '__main__': # With mktokenizer and tokenize_dataset.py - tokenized_dataset = datasets.load_from_disk(sys.argv[1]) + tokenized_dataset = datasets.load_from_disk(sys.argv[1]) # Need two passes of `map`` because to normalize we have to know min and max of values tokenized_dataset = tokenized_dataset.map(map_tokens_to_df) # In this pass we determine quality_score - scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset)) + scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset)) scored_dataset.save_to_disk(OUTPUT) From 003935453a63c50acc4a574b0dfc8eb01fa5f5f5 Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Thu, 11 Jun 2026 14:50:01 +0200 Subject: [PATCH 3/9] remove min and max from 
normalize because they shadow Python built-ins --- asmtransformers/scripts/quality_score.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 21a84f4..7269d32 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -29,8 +29,8 @@ def make_scorer(dataset): } def normalize(val, col): - min, max = cfg_info[col]['min'], cfg_info[col]['max'] - return (val - min) / (max - min) if max != min else 0 + mn, mx = cfg_info[col]['min'], cfg_info[col]['max'] + return (val - mn) / (mx - mn) if mx != mn else 0 def add_score(example): """ From 40452c62480387a5607e80127aa2d5d9a6933be0 Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Tue, 16 Jun 2026 14:05:17 +0200 Subject: [PATCH 4/9] lookup special tokens from tokenizer.json rather than hardcoding them --- asmtransformers/scripts/quality_score.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 7269d32..655f48d 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -1,3 +1,4 @@ +import json import sys from pathlib import Path @@ -6,18 +7,26 @@ OUTPUT = Path('./results') +# TOKEN_PATH = str('/home/lasse/Documents/gitlab-projecten/asmtransformers/asmtransformers/results/tokenizer.json') -def map_tokens_to_df(dataset): +def map_tokens_to_dataset(dataset, tokenizer_path): + with open(tokenizer_path) as f: + tokenizer = json.load(f) + jtp_out_of_range_token = tokenizer['model']['vocab']['JUMP_ADDR_EXCEEDED'] + jtp_unknown = tokenizer['model']['vocab']['UNK_JUMP_ADDR'] + minlength = max(jtp_out_of_range_token, jtp_unknown) + 1 token = np.asarray(dataset['input_ids']).ravel() - token = token[token != 3920] # 3920 is padding token - bincount = np.bincount(token, minlength=3497) + token_to_id = {t['content']: t['id'] 
for t in tokenizer['added_tokens']} + padding_token = token_to_id['[SEP]'] + token = token[token != padding_token] + bincount = np.bincount(token, minlength=minlength) return { 'len_cfg': len(token), 'jtp_in_range': int(bincount[:512].sum()), - 'jtp_out_of_range': int(bincount[1600]), # Jump target exceeded token (from tokenizer) - 'jtp_unknown': int(bincount[3496]), # Jump Target Unknown token (from tokenizer) + 'jtp_out_of_range': int(bincount[jtp_out_of_range_token]), # Jump target exceeded token (from tokenizer) + 'jtp_unknown': int(bincount[jtp_unknown]), # Jump Target Unknown token (from tokenizer) } @@ -50,10 +59,11 @@ def add_score(example): if __name__ == '__main__': # With mktokenizer and tokenize_dataset.py + dataset_path, tokenizer_path = sys.argv[1:] tokenized_dataset = datasets.load_from_disk(sys.argv[1]) # Need two passes of `map`` because to normalize we have to know min and max of values - tokenized_dataset = tokenized_dataset.map(map_tokens_to_df) + tokenized_dataset = tokenized_dataset.map(map_tokens_to_dataset, fn_kwargs={'tokenizer_path': tokenizer_path}) # In this pass we determine quality_score scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset)) scored_dataset.save_to_disk(OUTPUT) From 2bb4095ebd9b09dc939faec2a96e40709e19ddff Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Tue, 16 Jun 2026 14:10:48 +0200 Subject: [PATCH 5/9] update formatting --- asmtransformers/scripts/quality_score.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 655f48d..2439438 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -1,25 +1,24 @@ -import json import sys from pathlib import Path +import json import datasets import numpy as np OUTPUT = Path('./results') -# TOKEN_PATH = 
str('/home/lasse/Documents/gitlab-projecten/asmtransformers/asmtransformers/results/tokenizer.json') def map_tokens_to_dataset(dataset, tokenizer_path): with open(tokenizer_path) as f: tokenizer = json.load(f) - jtp_out_of_range_token = tokenizer['model']['vocab']['JUMP_ADDR_EXCEEDED'] - jtp_unknown = tokenizer['model']['vocab']['UNK_JUMP_ADDR'] + jtp_out_of_range_token = tokenizer["model"]["vocab"]["JUMP_ADDR_EXCEEDED"] + jtp_unknown = tokenizer["model"]["vocab"]["UNK_JUMP_ADDR"] minlength = max(jtp_out_of_range_token, jtp_unknown) + 1 token = np.asarray(dataset['input_ids']).ravel() - token_to_id = {t['content']: t['id'] for t in tokenizer['added_tokens']} + token_to_id = {t["content"]: t["id"] for t in tokenizer["added_tokens"]} padding_token = token_to_id['[SEP]'] - token = token[token != padding_token] + token = token[token != padding_token] bincount = np.bincount(token, minlength=minlength) return { @@ -58,12 +57,12 @@ def add_score(example): if __name__ == '__main__': - # With mktokenizer and tokenize_dataset.py + # Make sure tokenized dataset has been made With mktokenizer and tokenize_dataset.py dataset_path, tokenizer_path = sys.argv[1:] tokenized_dataset = datasets.load_from_disk(sys.argv[1]) # Need two passes of `map`` because to normalize we have to know min and max of values - tokenized_dataset = tokenized_dataset.map(map_tokens_to_dataset, fn_kwargs={'tokenizer_path': tokenizer_path}) + tokenized_dataset = tokenized_dataset.map(map_tokens_to_dataset, fn_kwargs={"tokenizer_path": tokenizer_path}) # In this pass we determine quality_score scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset)) scored_dataset.save_to_disk(OUTPUT) From 492e1e81942f1b273f354b0d26a5093a83499260 Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Tue, 16 Jun 2026 14:17:05 +0200 Subject: [PATCH 6/9] load tokenizer outside map function --- asmtransformers/scripts/quality_score.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff 
--git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 2439438..63a606d 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -1,6 +1,6 @@ +import json import sys from pathlib import Path -import json import datasets import numpy as np @@ -9,16 +9,14 @@ OUTPUT = Path('./results') -def map_tokens_to_dataset(dataset, tokenizer_path): - with open(tokenizer_path) as f: - tokenizer = json.load(f) - jtp_out_of_range_token = tokenizer["model"]["vocab"]["JUMP_ADDR_EXCEEDED"] - jtp_unknown = tokenizer["model"]["vocab"]["UNK_JUMP_ADDR"] +def map_tokens_to_dataset(dataset, tokenizer): + jtp_out_of_range_token = tokenizer['model']['vocab']['JUMP_ADDR_EXCEEDED'] + jtp_unknown = tokenizer['model']['vocab']['UNK_JUMP_ADDR'] minlength = max(jtp_out_of_range_token, jtp_unknown) + 1 token = np.asarray(dataset['input_ids']).ravel() - token_to_id = {t["content"]: t["id"] for t in tokenizer["added_tokens"]} + token_to_id = {t['content']: t['id'] for t in tokenizer['added_tokens']} padding_token = token_to_id['[SEP]'] - token = token[token != padding_token] + token = token[token != padding_token] bincount = np.bincount(token, minlength=minlength) return { @@ -59,10 +57,13 @@ def add_score(example): if __name__ == '__main__': # Make sure tokenized dataset has been made With mktokenizer and tokenize_dataset.py dataset_path, tokenizer_path = sys.argv[1:] - tokenized_dataset = datasets.load_from_disk(sys.argv[1]) + with open(tokenizer_path) as f: + tokenizer = json.load(f) + + tokenized_dataset = datasets.load_from_disk(dataset_path) # Need two passes of `map`` because to normalize we have to know min and max of values - tokenized_dataset = tokenized_dataset.map(map_tokens_to_dataset, fn_kwargs={"tokenizer_path": tokenizer_path}) + tokenized_dataset = tokenized_dataset.map(map_tokens_to_dataset, fn_kwargs={'tokenizer': tokenizer}) # In this pass we determine quality_score scored_dataset = 
tokenized_dataset.map(make_scorer(tokenized_dataset)) scored_dataset.save_to_disk(OUTPUT) From c99f1c4fede6a74f1c0fa025053a7a3f3ec4994a Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Tue, 16 Jun 2026 14:29:39 +0200 Subject: [PATCH 7/9] add more cpu to map --- asmtransformers/scripts/quality_score.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 63a606d..741db1c 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -63,7 +63,8 @@ def add_score(example): tokenized_dataset = datasets.load_from_disk(dataset_path) # Need two passes of `map`` because to normalize we have to know min and max of values - tokenized_dataset = tokenized_dataset.map(map_tokens_to_dataset, fn_kwargs={'tokenizer': tokenizer}) + tokenized_dataset = tokenized_dataset.map(map_tokens_to_dataset, fn_kwargs={'tokenizer': tokenizer}, num_proc=10) + # In this pass we determine quality_score - scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset)) + scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset), num_proc=10) scored_dataset.save_to_disk(OUTPUT) From 4a9674b051dd9ab4dfcbc96671b8b0c4b519cc3c Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Tue, 16 Jun 2026 14:52:04 +0200 Subject: [PATCH 8/9] change sep to padding --- asmtransformers/scripts/quality_score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 741db1c..98f8991 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -15,7 +15,7 @@ def map_tokens_to_dataset(dataset, tokenizer): minlength = max(jtp_out_of_range_token, jtp_unknown) + 1 token = np.asarray(dataset['input_ids']).ravel() token_to_id = {t['content']: t['id'] for t in tokenizer['added_tokens']} - padding_token = token_to_id['[SEP]'] 
+ padding_token = token_to_id['[PAD]'] token = token[token != padding_token] bincount = np.bincount(token, minlength=minlength) From 2035333c44ed14112f46c5a662ee786bf944a9c2 Mon Sep 17 00:00:00 2001 From: Lasse Vulto Date: Tue, 16 Jun 2026 14:54:43 +0200 Subject: [PATCH 9/9] output path from argv --- asmtransformers/scripts/quality_score.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/asmtransformers/scripts/quality_score.py b/asmtransformers/scripts/quality_score.py index 98f8991..461985e 100644 --- a/asmtransformers/scripts/quality_score.py +++ b/asmtransformers/scripts/quality_score.py @@ -1,14 +1,10 @@ import json import sys -from pathlib import Path import datasets import numpy as np -OUTPUT = Path('./results') - - def map_tokens_to_dataset(dataset, tokenizer): jtp_out_of_range_token = tokenizer['model']['vocab']['JUMP_ADDR_EXCEEDED'] jtp_unknown = tokenizer['model']['vocab']['UNK_JUMP_ADDR'] @@ -56,7 +52,7 @@ def add_score(example): if __name__ == '__main__': # Make sure tokenized dataset has been made With mktokenizer and tokenize_dataset.py - dataset_path, tokenizer_path = sys.argv[1:] + dataset_path, tokenizer_path, output_path = sys.argv[1:] with open(tokenizer_path) as f: tokenizer = json.load(f) @@ -67,4 +63,4 @@ def add_score(example): # In this pass we determine quality_score scored_dataset = tokenized_dataset.map(make_scorer(tokenized_dataset), num_proc=10) - scored_dataset.save_to_disk(OUTPUT) + scored_dataset.save_to_disk(output_path)