Skip to content

Commit 1e41c88

Browse files
committed
accept pathlib.Path, os.PathLike, bytes, and file-like objects in read_parquet/from_parquet
1 parent 50d2b28 commit 1e41c88

8 files changed

Lines changed: 272 additions & 215 deletions

File tree

_duckdb-stubs/__init__.pyi

Lines changed: 28 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -309,22 +309,15 @@ class DuckDBPyConnection:
309309
strict_mode: bool | None = None,
310310
) -> DuckDBPyRelation: ...
311311
def from_df(self, df: pandas.DataFrame) -> DuckDBPyRelation: ...
312-
@typing.overload
313-
def from_parquet(
314-
self,
315-
file_glob: str,
316-
binary_as_string: bool = False,
317-
*,
318-
file_row_number: bool = False,
319-
filename: bool = False,
320-
hive_partitioning: bool = False,
321-
union_by_name: bool = False,
322-
compression: ParquetCompression | None = None,
323-
) -> DuckDBPyRelation: ...
324-
@typing.overload
325312
def from_parquet(
326313
self,
327-
file_globs: Sequence[str],
314+
path_or_buffer: str
315+
| bytes
316+
| os.PathLike[str]
317+
| os.PathLike[bytes]
318+
| typing.IO[bytes]
319+
| typing.IO[str]
320+
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
328321
binary_as_string: bool = False,
329322
*,
330323
file_row_number: bool = False,
@@ -433,22 +426,15 @@ class DuckDBPyConnection:
433426
hive_types: HiveTypes | None = None,
434427
hive_types_autocast: bool | None = None,
435428
) -> DuckDBPyRelation: ...
436-
@typing.overload
437-
def read_parquet(
438-
self,
439-
file_glob: str,
440-
binary_as_string: bool = False,
441-
*,
442-
file_row_number: bool = False,
443-
filename: bool = False,
444-
hive_partitioning: bool = False,
445-
union_by_name: bool = False,
446-
compression: ParquetCompression | None = None,
447-
) -> DuckDBPyRelation: ...
448-
@typing.overload
449429
def read_parquet(
450430
self,
451-
file_globs: Sequence[str],
431+
path_or_buffer: str
432+
| bytes
433+
| os.PathLike[str]
434+
| os.PathLike[bytes]
435+
| typing.IO[bytes]
436+
| typing.IO[str]
437+
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
452438
binary_as_string: bool = False,
453439
*,
454440
file_row_number: bool = False,
@@ -1061,21 +1047,14 @@ def from_csv_auto(
10611047
strict_mode: bool | None = None,
10621048
) -> DuckDBPyRelation: ...
10631049
def from_df(df: pandas.DataFrame, *, connection: DuckDBPyConnection | None = None) -> DuckDBPyRelation: ...
1064-
@typing.overload
1065-
def from_parquet(
1066-
file_glob: str,
1067-
binary_as_string: bool = False,
1068-
*,
1069-
file_row_number: bool = False,
1070-
filename: bool = False,
1071-
hive_partitioning: bool = False,
1072-
union_by_name: bool = False,
1073-
compression: ParquetCompression | None = None,
1074-
connection: DuckDBPyConnection | None = None,
1075-
) -> DuckDBPyRelation: ...
1076-
@typing.overload
10771050
def from_parquet(
1078-
file_globs: Sequence[str],
1051+
path_or_buffer: str
1052+
| bytes
1053+
| os.PathLike[str]
1054+
| os.PathLike[bytes]
1055+
| typing.IO[bytes]
1056+
| typing.IO[str]
1057+
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
10791058
binary_as_string: bool = False,
10801059
*,
10811060
file_row_number: bool = False,
@@ -1232,21 +1211,14 @@ def read_json(
12321211
hive_types: HiveTypes | None = None,
12331212
hive_types_autocast: bool | None = None,
12341213
) -> DuckDBPyRelation: ...
1235-
@typing.overload
1236-
def read_parquet(
1237-
file_glob: str,
1238-
binary_as_string: bool = False,
1239-
*,
1240-
file_row_number: bool = False,
1241-
filename: bool = False,
1242-
hive_partitioning: bool = False,
1243-
union_by_name: bool = False,
1244-
compression: ParquetCompression | None = None,
1245-
connection: DuckDBPyConnection | None = None,
1246-
) -> DuckDBPyRelation: ...
1247-
@typing.overload
12481214
def read_parquet(
1249-
file_globs: Sequence[str],
1215+
path_or_buffer: str
1216+
| bytes
1217+
| os.PathLike[str]
1218+
| os.PathLike[bytes]
1219+
| typing.IO[bytes]
1220+
| typing.IO[str]
1221+
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
12501222
binary_as_string: bool = False,
12511223
*,
12521224
file_row_number: bool = False,

scripts/connection_methods.json

Lines changed: 3 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -941,11 +941,11 @@
941941
"read_parquet"
942942
],
943943
"function": "FromParquet",
944-
"docs": "Create a relation object from the Parquet files in file_glob",
944+
"docs": "Create a relation object from the Parquet path(s) or file-like object(s) in 'path_or_buffer'",
945945
"args": [
946946
{
947-
"name": "file_glob",
948-
"type": "str"
947+
"name": "path_or_buffer",
948+
"type": "Union[str, bytes, os.PathLike, IO[bytes], IO[str], Sequence[Union[str, bytes, os.PathLike, IO[bytes], IO[str]]]]"
949949
},
950950
{
951951
"name": "binary_as_string",
@@ -982,53 +982,6 @@
982982
],
983983
"return": "DuckDBPyRelation"
984984
},
985-
{
986-
"name": [
987-
"from_parquet",
988-
"read_parquet"
989-
],
990-
"function": "FromParquets",
991-
"docs": "Create a relation object from the Parquet files in file_globs",
992-
"args": [
993-
{
994-
"name": "file_globs",
995-
"type": "List[str]"
996-
},
997-
{
998-
"name": "binary_as_string",
999-
"default": "False",
1000-
"type": "bool"
1001-
}
1002-
],
1003-
"kwargs": [
1004-
{
1005-
"name": "file_row_number",
1006-
"default": "False",
1007-
"type": "bool"
1008-
},
1009-
{
1010-
"name": "filename",
1011-
"default": "False",
1012-
"type": "bool"
1013-
},
1014-
{
1015-
"name": "hive_partitioning",
1016-
"default": "False",
1017-
"type": "bool"
1018-
},
1019-
{
1020-
"name": "union_by_name",
1021-
"default": "False",
1022-
"type": "bool"
1023-
},
1024-
{
1025-
"name": "compression",
1026-
"default": "None",
1027-
"type": "str"
1028-
}
1029-
],
1030-
"return": "DuckDBPyRelation"
1031-
},
1032985
{
1033986
"name": "get_table_names",
1034987
"function": "GetTableNames",

src/duckdb_py/duckdb_python.cpp

Lines changed: 14 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -748,64 +748,34 @@ static void InitializeConnectionMethods(py::module_ &m) {
748748
py::arg("connection") = py::none());
749749
m.def(
750750
"from_parquet",
751-
[](const string &file_glob, bool binary_as_string, bool file_row_number, bool filename, bool hive_partitioning,
752-
bool union_by_name, const py::object &compression = py::none(),
753-
shared_ptr<DuckDBPyConnection> conn = nullptr) {
754-
if (!conn) {
755-
conn = DuckDBPyConnection::DefaultConnection();
756-
}
757-
return conn->FromParquet(file_glob, binary_as_string, file_row_number, filename, hive_partitioning,
758-
union_by_name, compression);
759-
},
760-
"Create a relation object from the Parquet files in file_glob", py::arg("file_glob"),
761-
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
762-
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
763-
py::arg("compression") = py::none(), py::arg("connection") = py::none());
764-
m.def(
765-
"read_parquet",
766-
[](const string &file_glob, bool binary_as_string, bool file_row_number, bool filename, bool hive_partitioning,
767-
bool union_by_name, const py::object &compression = py::none(),
768-
shared_ptr<DuckDBPyConnection> conn = nullptr) {
769-
if (!conn) {
770-
conn = DuckDBPyConnection::DefaultConnection();
771-
}
772-
return conn->FromParquet(file_glob, binary_as_string, file_row_number, filename, hive_partitioning,
773-
union_by_name, compression);
774-
},
775-
"Create a relation object from the Parquet files in file_glob", py::arg("file_glob"),
776-
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
777-
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
778-
py::arg("compression") = py::none(), py::arg("connection") = py::none());
779-
m.def(
780-
"from_parquet",
781-
[](const vector<string> &file_globs, bool binary_as_string, bool file_row_number, bool filename,
751+
[](const py::object &path_or_buffer, bool binary_as_string, bool file_row_number, bool filename,
782752
bool hive_partitioning, bool union_by_name, const py::object &compression = py::none(),
783753
shared_ptr<DuckDBPyConnection> conn = nullptr) {
784754
if (!conn) {
785755
conn = DuckDBPyConnection::DefaultConnection();
786756
}
787-
return conn->FromParquets(file_globs, binary_as_string, file_row_number, filename, hive_partitioning,
788-
union_by_name, compression);
757+
return conn->FromParquet(path_or_buffer, binary_as_string, file_row_number, filename, hive_partitioning,
758+
union_by_name, compression);
789759
},
790-
"Create a relation object from the Parquet files in file_globs", py::arg("file_globs"),
791-
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
792-
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
793-
py::arg("compression") = py::none(), py::arg("connection") = py::none());
760+
"Create a relation object from the Parquet path(s) or file-like object(s) in 'path_or_buffer'",
761+
py::arg("path_or_buffer"), py::arg("binary_as_string") = false, py::kw_only(),
762+
py::arg("file_row_number") = false, py::arg("filename") = false, py::arg("hive_partitioning") = false,
763+
py::arg("union_by_name") = false, py::arg("compression") = py::none(), py::arg("connection") = py::none());
794764
m.def(
795765
"read_parquet",
796-
[](const vector<string> &file_globs, bool binary_as_string, bool file_row_number, bool filename,
766+
[](const py::object &path_or_buffer, bool binary_as_string, bool file_row_number, bool filename,
797767
bool hive_partitioning, bool union_by_name, const py::object &compression = py::none(),
798768
shared_ptr<DuckDBPyConnection> conn = nullptr) {
799769
if (!conn) {
800770
conn = DuckDBPyConnection::DefaultConnection();
801771
}
802-
return conn->FromParquets(file_globs, binary_as_string, file_row_number, filename, hive_partitioning,
803-
union_by_name, compression);
772+
return conn->FromParquet(path_or_buffer, binary_as_string, file_row_number, filename, hive_partitioning,
773+
union_by_name, compression);
804774
},
805-
"Create a relation object from the Parquet files in file_globs", py::arg("file_globs"),
806-
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
807-
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
808-
py::arg("compression") = py::none(), py::arg("connection") = py::none());
775+
"Create a relation object from the Parquet path(s) or file-like object(s) in 'path_or_buffer'",
776+
py::arg("path_or_buffer"), py::arg("binary_as_string") = false, py::kw_only(),
777+
py::arg("file_row_number") = false, py::arg("filename") = false, py::arg("hive_partitioning") = false,
778+
py::arg("union_by_name") = false, py::arg("compression") = py::none(), py::arg("connection") = py::none());
809779
m.def(
810780
"get_table_names",
811781
[](const string &query, bool qualified, shared_ptr<DuckDBPyConnection> conn = nullptr) {

src/duckdb_py/include/duckdb_python/pyconnection/pyconnection.hpp

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -293,16 +293,9 @@ struct DuckDBPyConnection : public enable_shared_from_this<DuckDBPyConnection> {
293293

294294
unique_ptr<DuckDBPyRelation> FromDF(const PandasDataFrame &value);
295295

296-
unique_ptr<DuckDBPyRelation> FromParquet(const string &file_glob, bool binary_as_string, bool file_row_number,
297-
bool filename, bool hive_partitioning, bool union_by_name,
298-
const py::object &compression = py::none());
299-
unique_ptr<DuckDBPyRelation> FromParquets(const vector<string> &file_globs, bool binary_as_string,
300-
bool file_row_number, bool filename, bool hive_partitioning,
301-
bool union_by_name, const py::object &compression = py::none());
302-
303-
unique_ptr<DuckDBPyRelation> FromParquetInternal(Value &&file_param, bool binary_as_string, bool file_row_number,
304-
bool filename, bool hive_partitioning, bool union_by_name,
305-
const py::object &compression = py::none());
296+
unique_ptr<DuckDBPyRelation> FromParquet(const py::object &path_or_buffer, bool binary_as_string,
297+
bool file_row_number, bool filename, bool hive_partitioning,
298+
bool union_by_name, const py::object &compression = py::none());
306299

307300
unique_ptr<DuckDBPyRelation> FromArrow(py::object &arrow_object);
308301

src/duckdb_py/path_like.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@ namespace duckdb {
1010

1111
struct PathLikeProcessor {
1212
public:
13-
PathLikeProcessor(DuckDBPyConnection &connection, PythonImportCache &import_cache)
14-
: connection(connection), import_cache(import_cache) {
13+
explicit PathLikeProcessor(DuckDBPyConnection &connection) : connection(connection) {
1514
}
1615

1716
public:
@@ -29,7 +28,6 @@ struct PathLikeProcessor {
2928
public:
3029
DuckDBPyConnection &connection;
3130
optional_ptr<ModifiedMemoryFileSystem> object_store;
32-
PythonImportCache &import_cache;
3331
// The list containing every file
3432
vector<string> all_files;
3533
// The list of files that are registered in the object_store;
@@ -41,8 +39,10 @@ void PathLikeProcessor::AddFile(const py::object &object) {
4139
all_files.push_back(std::string(py::str(object)));
4240
return;
4341
}
44-
if (py::isinstance(object, import_cache.pathlib.Path())) {
45-
all_files.push_back(std::string(py::str(object)));
42+
if (py::isinstance<py::bytes>(object) || py::hasattr(object, "__fspath__")) {
43+
// A bytes path or an os.PathLike object (e.g. pathlib.Path) - decode it to a string
44+
auto fsdecode = py::module_::import("os").attr("fsdecode");
45+
all_files.push_back(std::string(py::str(fsdecode(object))));
4646
return;
4747
}
4848
// This is (assumed to be) a file-like object
@@ -79,9 +79,7 @@ PathLike PathLikeProcessor::Finalize() {
7979
}
8080

8181
PathLike PathLike::Create(const py::object &object, DuckDBPyConnection &connection) {
82-
auto &import_cache = *DuckDBPyConnection::ImportCache();
83-
84-
PathLikeProcessor processor(connection, import_cache);
82+
PathLikeProcessor processor(connection);
8583
if (py::isinstance<py::list>(object)) {
8684
auto list = py::list(object);
8785
for (auto &item : list) {

0 commit comments

Comments
 (0)