@@ -2294,6 +2294,80 @@ def test_scan_with_batch_size(tmp_path: Path):
22942294 assert batch .num_rows != 12
22952295
22962296
def test_dictionaries(tmp_path: Path):
    """Exercise a dictionary-typed column through write, insert and merge-insert.

    The dataset is created with three rows, extended with three rows whose
    dictionary values were not in the initial write, and finally upserted
    (one matching id, one new id) with yet more unseen values. After each
    step the stored contents are compared against the expected values.
    """
    dict_type = pa.dictionary(pa.int32(), pa.string())

    def dict_column(values):
        # Build a dictionary-typed array from plain Python strings.
        return pa.array(values, dict_type)

    initial = pa.table(
        {
            "id": pa.array([1, 2, 3]),
            "dict": dict_column(["foo", "bar", "baz"]),
        }
    )
    dataset = lance.write_dataset(initial, tmp_path)
    expected_schema = pa.schema({"id": pa.int64(), "dict": dict_type})
    assert dataset.schema == expected_schema
    assert dataset.to_table() == initial

    # Inserted rows may carry values absent from the initial write.
    appended = pa.table(
        {
            "id": [4, 5, 6],
            "dict": dict_column(["qux", "quux", "corge"]),
        }
    )
    dataset.insert(appended)

    all_values = ["foo", "bar", "baz", "qux", "quux", "corge"]
    combined = dataset.to_table().combine_chunks()
    expected = pa.table(
        {
            "id": [1, 2, 3, 4, 5, 6],
            "dict": dict_column(all_values),
        }
    )
    assert combined == expected

    # After combine_chunks the first chunk is inspected for both the decoded
    # values and the underlying dictionary.
    first_chunk = combined.column("dict").chunk(0)
    assert first_chunk.type == dict_type
    assert first_chunk.to_pylist() == all_values
    assert first_chunk.dictionary.to_pylist() == all_values

    # Merge-insert keyed on "id": id 1 already exists (update), id 7 does not
    # (insert); both rows bring values not yet stored.
    upserts = pa.table(
        {
            "id": [1, 7],
            "dict": dict_column(["grault", "garply"]),
        }
    )
    merge_builder = dataset.merge_insert("id")
    merge_builder = merge_builder.when_matched_update_all()
    merge_builder = merge_builder.when_not_matched_insert_all()
    merge_builder.execute(upserts)

    merged = dataset.to_table().combine_chunks().sort_by("id")
    assert merged.column("id").to_pylist() == [1, 2, 3, 4, 5, 6, 7]
    assert merged.column("dict").to_pylist() == [
        "grault",
        "bar",
        "baz",
        "qux",
        "quux",
        "corge",
        "garply",
    ]
2369+
2370+
22972371@pytest .mark .slow
22982372def test_io_buffer_size (tmp_path : Path ):
22992373 # These cases regress deadlock issues that happen when the
0 commit comments