Skip to content

Commit aa4965d

Browse files
committed
improve data processing
1 parent 2bd0062 commit aa4965d

2 files changed

Lines changed: 25 additions & 3 deletions

File tree

loda/llm/data_preprocessing.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -106,14 +106,22 @@ def clean_loda_code(self, program_text: str) -> str:
106106
code_lines = []
107107

108108
for line in lines:
109-
# Skip comment lines
109+
# Skip comment lines (lines that start with ;)
110110
if line.strip().startswith(';'):
111111
continue
112112
# Skip empty lines
113113
if not line.strip():
114114
continue
115-
# Add the code line
116-
code_lines.append(line.strip())
115+
116+
# Remove inline comments (everything after ; on the same line)
117+
if ';' in line:
118+
code_part = line.split(';')[0].strip()
119+
else:
120+
code_part = line.strip()
121+
122+
# Only add non-empty code lines
123+
if code_part:
124+
code_lines.append(code_part)
117125

118126
return '\n'.join(code_lines)
119127

tests/test_llm.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,20 @@ def test_clean_loda_code(self):
9797
clean_code = preprocessor.clean_loda_code(dirty_code)
9898
expected = "mov $1,$0\npow $1,2\nmov $0,$1"
9999
self.assertEqual(clean_code, expected)
100+
101+
# Test inline comment removal
102+
code_with_inline_comments = (
103+
"; Header comment\n"
104+
"add $0,1\n"
105+
"sub $0,2 ; inline comment here\n"
106+
"mul $1,3 ; another inline comment\n"
107+
"; Full line comment\n"
108+
"div $2,4\n"
109+
)
110+
111+
clean_inline = preprocessor.clean_loda_code(code_with_inline_comments)
112+
expected_inline = "add $0,1\nsub $0,2\nmul $1,3\ndiv $2,4"
113+
self.assertEqual(clean_inline, expected_inline)
100114

101115
def test_training_example_creation(self):
102116
"""Test TrainingExample creation."""

0 commit comments

Comments (0)