@@ -131,12 +131,60 @@ def read(self, table_name: str | list[str]) -> pd.DataFrame:
131131 return self ._read_multiple (table_name )
132132 return self ._read_single (table_name )
133133
def clear_partition(self, table_name: str) -> None:
    """Delete every object stored under a hive partition prefix in S3.

    The partition prefix is built as
    ``<_CACHE_ROOT>/<_VERSION_FOLDER>/<base>/<partition_key>=<value>/`` and
    every key listed under it is removed, whatever its file extension.

    Args:
        table_name: Partitioned table name in ``"<base>/<value>"`` form.
            Names without a ``/`` are not partitioned and are ignored.

    Raises:
        KeyError: If ``base`` has no entry in ``HIVE_PARTITION_KEYS``.
        RuntimeError: If S3 reports that one or more keys could not be
            deleted.
    """
    base, sep, value = table_name.partition("/")
    if not sep:
        return

    partition_key = HIVE_PARTITION_KEYS[base]
    prefix = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/"

    paginator = self.s3_client.get_paginator("list_objects_v2")
    to_delete = [
        {"Key": obj["Key"]}
        for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix)
        for obj in page.get("Contents", [])
    ]

    # delete_objects accepts at most 1000 keys per request.
    batch_size = 1000
    for start in range(0, len(to_delete), batch_size):
        response = self.s3_client.delete_objects(
            Bucket=self.bucket,
            Delete={"Objects": to_delete[start : start + batch_size]},
        )
        # Per-key failures are reported in the response body instead of being
        # raised; ignoring them would leave stale chunks in the partition.
        errors = (response or {}).get("Errors")
        if errors:
            failed_keys = [err.get("Key") for err in errors]
            raise RuntimeError(
                f"Failed to delete {len(errors)} object(s) under "
                f"s3://{self.bucket}/{prefix}: {failed_keys}"
            )
151+
def write_chunk(self, table_name: str, data: pd.DataFrame, chunk_idx: int) -> None:
    """Write one numbered parquet chunk into a hive partition on S3.

    The chunk is stored at
    ``<_CACHE_ROOT>/<_VERSION_FOLDER>/<base>/<partition_key>=<value>/data_<chunk_idx:04d>.pqt``
    and the table-level ``<base>.json`` metadata object is rewritten with the
    chunk's column names.

    Args:
        table_name: Partitioned table name in ``"<base>/<value>"`` form.
        data: Rows to store; the pandas index is not written.
        chunk_idx: Chunk number, zero-padded to four digits in the object key.

    Raises:
        ValueError: If ``table_name`` contains no ``/`` separator.
        KeyError: If ``base`` has no entry in ``HIVE_PARTITION_KEYS``.
    """
    base, sep, value = table_name.partition("/")
    if not sep:
        raise ValueError(
            f"write_chunk requires a partitioned table name of the form "
            f"'<base>/<value>', got {table_name!r}"
        )

    partition_key = HIVE_PARTITION_KEYS[base]
    s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/data_{chunk_idx:04d}.pqt"
    json_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}.json"

    table = pa.Table.from_pandas(data, preserve_index=False)
    # Floating-point columns use BYTE_STREAM_SPLIT; dictionary encoding is
    # enabled only for the remaining columns so the two never overlap.
    float_cols = [f.name for f in table.schema if pa.types.is_floating(f.type)]
    dict_cols = [f.name for f in table.schema if f.name not in float_cols]

    parquet_buffer = io.BytesIO()
    pq.write_table(
        table,
        parquet_buffer,
        compression="zstd",
        use_dictionary=dict_cols if dict_cols else False,
        column_encoding={col: "BYTE_STREAM_SPLIT" for col in float_cols} or None,
    )
    # getvalue() returns the whole buffer regardless of the stream position.
    self.s3_client.put_object(Bucket=self.bucket, Key=s3_key, Body=parquet_buffer.getvalue())
    logging.info(
        CacheLogMessage(
            backend="S3Backend",
            table=table_name,
            message=f"Stored chunk {chunk_idx} to s3://{self.bucket}/{s3_key}",
        ).to_json()
    )

    metadata = {"columns": data.columns.tolist()}
    self.s3_client.put_object(
        Bucket=self.bucket, Key=json_key, Body=json.dumps(metadata)
    )
181+
134182 def _read_single (self , table_name : str ) -> pd .DataFrame :
135183 """Fetch a single table from S3."""
136184 if "/" in table_name :
137185 base , value = table_name .split ("/" , 1 )
138186 partition_key = HIVE_PARTITION_KEYS [base ]
139- s3_key = f"{ _CACHE_ROOT } /{ _VERSION_FOLDER } /{ base } /{ partition_key } ={ value } /data.pqt"
187+ s3_key = f"{ _CACHE_ROOT } /{ _VERSION_FOLDER } /{ base } /{ partition_key } ={ value } /data* .pqt"
140188 else :
141189 s3_key = f"{ _CACHE_ROOT } /{ _VERSION_FOLDER } /{ table_name } .pqt"
142190
@@ -315,6 +363,17 @@ def get_versions_index(self) -> list[str]:
315363 """Return the list of all available version folders from the in-memory index."""
316364 return json .loads (self ._json_store .get ("cache_versions.json" , "[]" ))
317365
def clear_partition(self, table_name: str) -> None:
    """Remove all chunks stored in memory for a partitioned table.

    Args:
        table_name: Partitioned table name in ``"<base>/<value>"`` form.
            Names without a ``/`` are not partitioned and are left
            untouched; a partition with nothing stored is a no-op.
    """
    # A name without "/" is not a partition; ignoring it avoids dropping a
    # whole non-partitioned table through this partition-level call.
    if "/" not in table_name:
        return
    self._store.pop(table_name, None)
369+
def write_chunk(self, table_name: str, data: pd.DataFrame, chunk_idx: int) -> None:
    """Append one chunk to the in-memory frame kept for a partitioned table.

    Chunks are concatenated in call order into a single DataFrame stored
    under ``table_name``, always with a fresh 0..n-1 index.

    Args:
        table_name: Key under which the accumulated frame is stored.
        data: Rows to append; the caller's frame is never stored directly.
        chunk_idx: Chunk number. It is not used here: every call appends,
            including a repeated ``chunk_idx``.
    """
    existing = self._store.get(table_name)
    if existing is None or existing.empty:
        # reset_index returns a new frame, so the store never aliases the
        # caller's object, and the first chunk gets the same fresh index
        # that later appends produce via ignore_index=True.
        self._store[table_name] = data.reset_index(drop=True)
    else:
        self._store[table_name] = pd.concat([existing, data], ignore_index=True)
376+
318377 def _read_multiple (self , table_names : list [str ]) -> pd .DataFrame :
319378 """Fetch and merge multiple tables from memory."""
320379 dfs = []
0 commit comments