1313 ColumnDefinition ,
1414 DatasetGroup ,
1515 Origin ,
16+ file_columns ,
1617)
1718from pysus .api .extensions import Parquet
1819from pysus .api .ftp .models import File as FTPFile
1920from pysus .api .models import BaseRemoteFile
21+ from sqlalchemy import delete
2022
2123
2224class CatalogManager :
@@ -59,6 +61,7 @@ async def upload(
5961 ) -> None :
6062 if not self .pysus ._ducklake :
6163 raise ConnectionError ("DuckLake is not connected" )
64+
6265 with self .pysus ._ducklake ._Session () as session :
6366 dataset = self ._get_or_create_dataset (session , file )
6467 group = self ._get_or_create_group (session , file , dataset )
@@ -67,7 +70,7 @@ async def upload(
6770 if not self ._should_upload (file , cat_file ):
6871 return
6972
70- session .commit ()
73+ session .flush ()
7174
7275 parquet_ext = await self .pysus .download_to_parquet (
7376 file = file , token = self .dadosgov_token , callback = callback
@@ -78,25 +81,37 @@ async def upload(
7881 f"/{ file .dataset .name .lower ()} /{ parquet_ext .path .name } "
7982 )
8083
81- await self ._upload_to_s3 (parquet_ext .path , s3_key )
82-
8384 with self .pysus ._ducklake ._Session () as session :
84- current_dataset = self ._get_or_create_dataset (session , file )
85- current_group = self ._get_or_create_group (
86- session , file , current_dataset
85+ existing_conflict = (
86+ session .query (CatalogFile )
87+ .filter (
88+ CatalogFile .path == s3_key ,
89+ CatalogFile .dataset_id == dataset .id ,
90+ )
91+ .first ()
8792 )
8893
89- cat_file = self ._get_or_create_file (
90- session , file , current_dataset , current_group
91- )
94+ if existing_conflict and existing_conflict .id != cat_file .id :
95+ session .execute (
96+ delete (file_columns ).where (
97+ file_columns .c .file_id == existing_conflict .id
98+ )
99+ )
100+ session .flush ()
101+ session .delete (existing_conflict )
102+ session .flush ()
103+
104+ cat_file = session .merge (cat_file )
105+
106+ await self ._upload_to_s3 (parquet_ext .path , s3_key )
92107
93108 cat_file .path = s3_key
94109 cat_file .size = parquet_ext .size
95110 cat_file .rows = parquet_ext .rows
96111 cat_file .modified = datetime .utcnow ()
97112 cat_file .origin_modified = file .modify
98113 cat_file .columns = self ._get_or_create_columns (
99- session , current_dataset , parquet_ext
114+ session , dataset , parquet_ext
100115 )
101116
102117 session .commit ()
@@ -140,9 +155,8 @@ def _get_or_create_dataset(
140155 ds_name = file .dataset .name .lower ()
141156 ds = session .query (CatalogDataset ).filter_by (name = ds_name ).first ()
142157 if not ds :
143- origin = (
144- Origin .FTP if file .client .name .lower () == "ftp" else Origin .API
145- )
158+ is_ftp = file .client .name .lower () == "ftp"
159+ origin = Origin .FTP if is_ftp else Origin .API
146160 ds = CatalogDataset (
147161 name = ds_name , long_name = file .dataset .long_name , origin = origin
148162 )
@@ -201,7 +215,7 @@ def _get_or_create_file(
201215 size = 0 ,
202216 rows = 0 ,
203217 modified = datetime .min ,
204- origin_path = file .path ,
218+ origin_path = str ( file .path ) ,
205219 year = file .year ,
206220 month = file .month ,
207221 state = file .state ,
0 commit comments