Skip to content

Commit 2124ba0

Browse files
feat(splitter): add Bash, CMake, and HCL tree-sitter support
1 parent 4421582 commit 2124ba0

3 files changed

Lines changed: 78 additions & 3 deletions

File tree

python/tests/ops/test_text.py

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,9 @@ def test_detect_code_language_known_extensions() -> None:
1919
assert detect_code_language(filename="App.vue") == "vue"
2020
assert detect_code_language(filename="script.jl") == "julia"
2121
assert detect_code_language(filename="Main.elm") == "elm"
22+
assert detect_code_language(filename="deploy.sh") == "bash"
23+
assert detect_code_language(filename="CMakeLists.cmake") == "cmake"
24+
assert detect_code_language(filename="main.tf") == "hcl"
2225

2326

2427
def test_detect_code_language_unknown_extension() -> None:
@@ -275,3 +278,60 @@ def test_custom_language_config_alias() -> None:
275278
assert len(chunks) == 2
276279
assert chunks[0].text == "PartA"
277280
assert chunks[1].text == "PartB"
281+
282+
283+
def test_recursive_splitter_with_bash() -> None:
    """Split a small Bash script with ``language="bash"``.

    This is a smoke test for syntax-aware splitting: it only verifies that
    at least one chunk is produced and that every element is a ``Chunk``.
    """
    # Script with a function definition followed by a loop that calls it.
    bash_source = (
        "#!/usr/bin/env bash\n\n"
        "greet() {\n"
        ' echo "Hello, $1!"\n'
        "}\n\n"
        "for name in Alice Bob; do\n"
        " greet \"$name\"\n"
        "done\n"
    )

    pieces = RecursiveSplitter().split(
        bash_source,
        chunk_size=60,
        min_chunk_size=20,
        language="bash",
    )

    assert len(pieces) > 0
    for piece in pieces:
        assert isinstance(piece, Chunk)
299+
300+
301+
def test_recursive_splitter_with_cmake() -> None:
    """Split a small CMake script with ``language="cmake"``.

    This is a smoke test for syntax-aware splitting: it only verifies that
    at least one chunk is produced and that every element is a ``Chunk``.
    """
    # Project preamble, one function definition, and one call to it.
    cmake_source = (
        "cmake_minimum_required(VERSION 3.20)\n"
        "project(MyProject)\n\n"
        "function(add_my_target name)\n"
        " add_executable(${name} main.cpp)\n"
        " target_compile_features(${name} PRIVATE cxx_std_17)\n"
        "endfunction()\n\n"
        "add_my_target(app)\n"
    )

    pieces = RecursiveSplitter().split(
        cmake_source,
        chunk_size=80,
        min_chunk_size=20,
        language="cmake",
    )

    assert len(pieces) > 0
    for piece in pieces:
        assert isinstance(piece, Chunk)
317+
318+
319+
def test_recursive_splitter_with_hcl() -> None:
    """Split a small HCL/Terraform document with ``language="hcl"``.

    This is a smoke test for syntax-aware splitting: it only verifies that
    at least one chunk is produced and that every element is a ``Chunk``.
    """
    # Three top-level blocks: terraform settings, a resource, and an output.
    hcl_source = (
        'terraform {\n required_version = ">= 1.0"\n}\n\n'
        'resource "aws_s3_bucket" "example" {\n'
        ' bucket = "my-bucket"\n\n'
        " tags = {\n"
        ' Name = "example"\n'
        " }\n"
        "}\n\n"
        'output "bucket_name" {\n'
        ' value = aws_s3_bucket.example.bucket\n'
        "}\n"
    )

    pieces = RecursiveSplitter().split(
        hcl_source,
        chunk_size=80,
        min_chunk_size=20,
        language="hcl",
    )

    assert len(pieces) > 0
    for piece in pieces:
        assert isinstance(piece, Chunk)

rust/ops_text/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,9 @@ tree-sitter-typescript = "0.23.2"
4242
tree-sitter-xml = "0.7.0"
4343
tree-sitter-yaml = "0.7.2"
4444
tree-sitter-elm = "5.9.0"
45+
tree-sitter-bash = "0.25.1"
46+
tree-sitter-cmake = "0.7.1"
47+
tree-sitter-hcl = "1.1.0"
4548
tree-sitter-solidity = "1.2.13"
4649
tree-sitter-svelte-ng = "1.0.2"
4750
tree-sitter-vue-next = "0.1.0"

rust/ops_text/src/prog_langs.rs

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,11 @@ static LANGUAGE_INFO_BY_NAME: LazyLock<
7676
add("arduino", &[".ino"], None);
7777
add("asm", &[".asm", ".a51", ".i", ".nas", ".nasm", ".s"], None);
7878
add("astro", &[".astro"], None);
79-
add("bash", &[".sh", ".bash"], None);
79+
add(
80+
"bash",
81+
&[".sh", ".bash"],
82+
Some(TreeSitterLanguageInfo::new(tree_sitter_bash::LANGUAGE, [])),
83+
);
8084
add("beancount", &[".beancount"], None);
8185
add("bibtex", &[".bib", ".bibtex"], None);
8286
add("bicep", &[".bicep", ".bicepparam"], None);
@@ -97,7 +101,11 @@ static LANGUAGE_INFO_BY_NAME: LazyLock<
97101
],
98102
None,
99103
);
100-
add("cmake", &[".cmake", ".cmake.in"], None);
104+
add(
105+
"cmake",
106+
&[".cmake", ".cmake.in"],
107+
Some(TreeSitterLanguageInfo::new(tree_sitter_cmake::LANGUAGE, [])),
108+
);
101109
add(
102110
"commonlisp",
103111
&[
@@ -191,7 +199,11 @@ static LANGUAGE_INFO_BY_NAME: LazyLock<
191199
add("hare", &[".ha"], None);
192200
add("haskell", &[".hs", ".hs-boot", ".hsc"], None);
193201
add("haxe", &[".hx"], None);
194-
add("hcl", &[".hcl", ".tf"], None);
202+
add(
203+
"hcl",
204+
&[".hcl", ".tf"],
205+
Some(TreeSitterLanguageInfo::new(tree_sitter_hcl::LANGUAGE, [])),
206+
);
195207
add("heex", &[".heex"], None);
196208
add("hlsl", &[".hlsl"], None);
197209
add(

0 commit comments

Comments
 (0)