diff --git a/.gitignore b/.gitignore index 0eaf92e665..2648020ca0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ # local configurations .idea .vscode/*.log +settings.json # tests coverage .coverage diff --git a/front/admin_ui/poetry.lock b/front/admin_ui/poetry.lock index a3dfd6ea9a..8c1302705b 100644 --- a/front/admin_ui/poetry.lock +++ b/front/admin_ui/poetry.lock @@ -672,15 +672,13 @@ tests = ["pytest", "pytest-cov", "pytest-xdist"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -714,6 +712,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "dill" version = "0.3.8" @@ -1519,7 +1523,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/jobs/cache_maintenance/poetry.lock b/jobs/cache_maintenance/poetry.lock index 5b52445b83..18af8e20cd 100644 --- a/jobs/cache_maintenance/poetry.lock +++ b/jobs/cache_maintenance/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1199,7 +1203,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/jobs/mongodb_migration/poetry.lock b/jobs/mongodb_migration/poetry.lock index f2896d001d..0ca39a5f2f 100644 --- a/jobs/mongodb_migration/poetry.lock +++ b/jobs/mongodb_migration/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1199,7 +1203,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/libs/libapi/poetry.lock b/libs/libapi/poetry.lock index 9e6ceb49db..3104a67e2a 100644 --- a/libs/libapi/poetry.lock +++ b/libs/libapi/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1218,7 +1222,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/libs/libcommon/poetry.lock b/libs/libcommon/poetry.lock index 17aced77f1..fa96a229b8 100644 --- a/libs/libcommon/poetry.lock +++ b/libs/libcommon/poetry.lock @@ -675,15 +675,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -717,6 +715,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -4680,4 +4684,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.1" python-versions = "3.12.11" -content-hash = "f5d6164ae9685e20255129423c595b708f9744b381dc0de71f9a48f7566db4a8" +content-hash = "f0fa8746dd596b0b1f3b4b79c89011a5475cf36617e0be32e11ba4ad7559223f" diff --git a/libs/libcommon/pyproject.toml b/libs/libcommon/pyproject.toml index 9d45b26088..bb3249f7f0 100644 --- a/libs/libcommon/pyproject.toml +++ b/libs/libcommon/pyproject.toml @@ -10,7 +10,7 @@ python = "3.12.11" anyio = ">=3.4.0,<5" appdirs = "^1.4.4" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = { git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" } duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/admin/poetry.lock b/services/admin/poetry.lock index c083f03ec2..2b217d3eac 100644 --- a/services/admin/poetry.lock +++ b/services/admin/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1222,7 +1226,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/api/poetry.lock b/services/api/poetry.lock index 1dd0dc85fb..39a1504ab9 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1259,7 +1263,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/rows/poetry.lock b/services/rows/poetry.lock index f43997716f..d60eaea7b4 100644 --- a/services/rows/poetry.lock +++ b/services/rows/poetry.lock @@ -661,15 +661,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -703,6 +701,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1280,7 +1284,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/search/poetry.lock b/services/search/poetry.lock index 13f980b34a..100c02d36f 100644 --- a/services/search/poetry.lock +++ b/services/search/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1259,7 +1263,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/sse-api/poetry.lock b/services/sse-api/poetry.lock index 0234b592ea..8d6504a2bf 100644 --- a/services/sse-api/poetry.lock +++ b/services/sse-api/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1290,7 +1294,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/webhook/poetry.lock b/services/webhook/poetry.lock index bef00b2f2a..a35b511e2f 100644 --- a/services/webhook/poetry.lock +++ b/services/webhook/poetry.lock @@ -640,15 +640,13 @@ xml-validation = ["lxml (>=4,<6)"] [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -682,6 +680,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1259,7 +1263,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index 67bf5aa35e..a4929a44aa 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -983,15 +983,13 @@ files = [ [[package]] name = "datasets" -version = "4.8.4" +version = "4.8.5.dev0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.10.0" groups = ["main"] -files = [ - {file = "datasets-4.8.4-py3-none-any.whl", hash = "sha256:cdc8bee4698e549d78bf1fed6aea2eebc760b22b084f07e6fc020c6577a6ce6d"}, - {file = "datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52"}, -] +files = [] +develop = false [package.dependencies] dill = ">=0.3.0,<0.4.2" @@ -1025,6 +1023,12 @@ tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "aiohttp", "decorator", "elastics torch = ["torch"] vision = ["Pillow (>=9.4.0)"] +[package.source] +type = "git" +url = "https://github.com/huggingface/datasets" +reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" +resolved_reference = "1cacbe629f208d7ace2c92db91a67b0060e0e07b" + [[package]] name = "defusedxml" version = "0.7.1" @@ -1696,7 +1700,7 @@ anyio = ">=3.4.0,<5" appdirs = "^1.4.4" async-lru = "^2.0.5" cryptography = "^43.0.1" -datasets = "^4.8.4" +datasets = {git = "https://github.com/huggingface/datasets", rev = "1cacbe629f208d7ace2c92db91a67b0060e0e07b"} duckdb = "^1.2.2" environs = "^14.3.0" filelock = "^3.18.0" diff --git a/services/worker/src/worker/dtos.py b/services/worker/src/worker/dtos.py index 3bf6431585..9d89a31c7f 100644 --- a/services/worker/src/worker/dtos.py +++ b/services/worker/src/worker/dtos.py @@ -256,7 +256,17 @@ class IsValidResponse(TypedDict): DatasetLibrary = Literal["mlcroissant", "webdataset", "datasets", "pandas", "dask", "polars", "lance"] DatasetFormat = Literal[ - "json", "csv", "parquet", "imagefolder", "audiofolder", "webdataset", "text", "arrow", "optimized-parquet", "lance" + "json", + "csv", + "parquet", + "imagefolder", + "audiofolder", + "webdataset", + "text", + "arrow", + "optimized-parquet", + "lance", + "agent-traces", ] ProgrammingLanguage = Literal["python"] diff --git a/services/worker/src/worker/job_runners/dataset/compatible_libraries.py b/services/worker/src/worker/job_runners/dataset/compatible_libraries.py index 5214be28b4..6440d92aef 100644 --- a/services/worker/src/worker/job_runners/dataset/compatible_libraries.py +++ b/services/worker/src/worker/job_runners/dataset/compatible_libraries.py @@ -11,7 +11,7 @@ import datasets.data_files import pyarrow.parquet as pq import yaml -from datasets import BuilderConfig, DownloadConfig +from datasets import BuilderConfig, DownloadConfig, Features from datasets.data_files import ( NON_WORDS_CHARS, DataFilesDict, @@ -29,6 +29,7 @@ _MODULE_TO_METADATA_FILE_NAMES, _PACKAGED_DATASETS_MODULES, ) +from datasets.packaged_modules.json.json import AGENT_TRACES_FEATURES from datasets.utils.metadata import MetadataConfigs from huggingface_hub import DatasetCard, DatasetCardData, HfFileSystem from libcommon.constants import LOADING_METHODS_MAX_CONFIGS @@ -889,6 +890,15 @@ def compute_compatible_libraries_response( ): formats.append("optimized-parquet") + # Agent Traces + if "json" in formats: + if infos: + for info in infos: + if "features" in info and isinstance(info["features"], dict): + if Features.from_dict(info["features"]) == AGENT_TRACES_FEATURES: + formats.append("agent-traces") + break + return DatasetCompatibleLibrariesResponse(libraries=libraries, formats=formats)