Skip to content

Commit 6b0f547

Browse files
authored
Fix migration of assets with duplicate filenames and improve model logging (#19)
* Fix migration of assets with duplicate filenames The copy phase used a dict keyed by fileName to store asset metadata, causing duplicate-named assets to be silently dropped. The download phase also saved duplicate-named non-remote assets to the same file path, overwriting earlier ones. Change assets_metadata from a dict to a list so all entries are preserved, and assign unique diskFileName values during download for non-remote assets with colliding names. * Fix model logging Updates code to use log_model/log_remote_model to log model assets, rather than generically log_asset/log_remote_asset. This ensures that the model elements roll into the right parent model, and that the models can be registered to the registry if desired. * Address PR Review Feedback Addressing Doug's comments in the review
1 parent 59af61f commit 6b0f547

2 files changed

Lines changed: 96 additions & 66 deletions

File tree

cometx/cli/copy.py

Lines changed: 60 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,18 +1196,17 @@ def update_datagrid_contents(
11961196
asset_map[old_asset_id] = result["assetId"]
11971197

11981198
def _log_asset(
1199-
self, experiment, path, asset_type, log_filename, assets_metadata, asset_map
1199+
self, experiment, path, asset_type, asset_data, asset_map
12001200
):
1201-
log_as_filename = assets_metadata[log_filename].get(
1202-
"logAsFileName",
1203-
None,
1204-
)
1205-
step = assets_metadata[log_filename].get("step")
1206-
epoch = assets_metadata[log_filename].get("epoch")
1207-
old_asset_id = assets_metadata[log_filename].get("assetId")
1201+
log_as_filename = asset_data.get("logAsFileName", None)
1202+
original_filename = asset_data["fileName"]
1203+
disk_filename = asset_data.get("diskFileName", original_filename)
1204+
step = asset_data.get("step")
1205+
epoch = asset_data.get("epoch")
1206+
old_asset_id = asset_data.get("assetId")
12081207
if asset_type in self.ignore:
12091208
return
1210-
sanitized_filename = sanitize_filename(log_filename)
1209+
sanitized_filename = sanitize_filename(disk_filename)
12111210
filename = os.path.join(path, asset_type, sanitized_filename)
12121211

12131212
if not os.path.isfile(filename):
@@ -1217,7 +1216,7 @@ def _log_asset(
12171216
print("Missing file %r: unable to copy" % filename)
12181217
return
12191218

1220-
metadata = assets_metadata[log_filename].get("metadata")
1219+
metadata = asset_data.get("metadata")
12211220
metadata = json.loads(metadata) if metadata else {}
12221221

12231222
if asset_type == "notebook":
@@ -1286,11 +1285,11 @@ def _log_asset(
12861285
metadata,
12871286
binary_io,
12881287
step,
1289-
log_as_filename or log_filename,
1288+
log_as_filename or original_filename,
12901289
)
12911290
if result is None:
12921291
print(
1293-
f"ERROR: Unable to log {asset_type} asset {log_as_filename or log_filename}; skipping"
1292+
f"ERROR: Unable to log {asset_type} asset {log_as_filename or original_filename}; skipping"
12941293
)
12951294
else:
12961295
asset_map[old_asset_id] = result["assetId"]
@@ -1301,7 +1300,7 @@ def _log_asset(
13011300
metadata,
13021301
step,
13031302
filename,
1304-
log_as_filename or log_filename,
1303+
log_as_filename or original_filename,
13051304
old_asset_id,
13061305
)
13071306
elif asset_type == "confusion-matrix":
@@ -1335,47 +1334,46 @@ def _log_asset(
13351334
metadata,
13361335
binary_io,
13371336
step,
1338-
log_as_filename or log_filename,
1337+
log_as_filename or original_filename,
13391338
)
13401339
if result is None:
13411340
print(
1342-
f"ERROR: Unable to log {asset_type} asset {log_as_filename or log_filename}; skipping"
1341+
f"ERROR: Unable to log {asset_type} asset {log_as_filename or original_filename}; skipping"
13431342
)
13441343
else:
13451344
asset_map[old_asset_id] = result["assetId"]
13461345
elif asset_type == "video":
1347-
name = os.path.basename(filename)
13481346
binary_io = open(filename, "rb")
13491347
result = experiment.log_video(
1350-
binary_io, name=log_as_filename or name, step=step, epoch=epoch
1351-
) # done!
1348+
binary_io,
1349+
name=log_as_filename or original_filename,
1350+
step=step,
1351+
epoch=epoch,
1352+
)
13521353
if result is None:
13531354
print(
1354-
f"ERROR: Unable to log {asset_type} asset {log_as_filename or name}; skipping"
1355+
f"ERROR: Unable to log {asset_type} asset {log_as_filename or original_filename}; skipping"
13551356
)
13561357
else:
13571358
asset_map[old_asset_id] = result["assetId"]
13581359
elif asset_type == "model-element":
1359-
dir_name = assets_metadata[log_filename].get("dir", "")
1360+
dir_name = asset_data.get("dir", "")
13601361
# The dir field includes a "models/" prefix added by the
13611362
# backend; strip it to get the actual model name.
13621363
if dir_name.startswith("models/"):
13631364
model_name = dir_name[len("models/"):]
13641365
else:
13651366
model_name = dir_name
13661367
binary_io = open(filename, "rb")
1367-
result = experiment._log_asset(
1368-
binary_io,
1369-
file_name=log_as_filename or log_filename,
1370-
copy_to_tmp=True,
1371-
asset_type=asset_type,
1368+
result = experiment.log_model(
1369+
name=model_name,
1370+
file_or_folder=binary_io,
1371+
file_name=log_as_filename or original_filename,
13721372
metadata=metadata,
1373-
step=step,
1374-
grouping_name=model_name,
13751373
)
13761374
if result is None:
13771375
print(
1378-
f"ERROR: Unable to log {asset_type} asset {log_as_filename or log_filename}; skipping"
1376+
f"ERROR: Unable to log {asset_type} asset {log_as_filename or original_filename}; skipping"
13791377
)
13801378
else:
13811379
asset_map[old_asset_id] = result["assetId"]
@@ -1386,11 +1384,11 @@ def _log_asset(
13861384
metadata,
13871385
filename,
13881386
step,
1389-
log_as_filename or log_filename,
1387+
log_as_filename or original_filename,
13901388
)
13911389
if result is None:
13921390
print(
1393-
f"ERROR: Unable to log {asset_type} asset {log_as_filename or log_filename}; skipping"
1391+
f"ERROR: Unable to log {asset_type} asset {log_as_filename or original_filename}; skipping"
13941392
)
13951393
else:
13961394
asset_map[old_asset_id] = result["assetId"]
@@ -1402,39 +1400,50 @@ def log_assets(self, experiment, path, assets_metadata):
14021400
# Create mapping from old asset id to new asset id
14031401
asset_map = {}
14041402
# Process all of the non-nested assets first:
1405-
for log_filename in assets_metadata:
1406-
asset_type = assets_metadata[log_filename].get("type", "asset") or "asset"
1403+
for asset_data in assets_metadata:
1404+
asset_type = asset_data.get("type", "asset") or "asset"
14071405
if asset_type not in ["confusion-matrix", "embeddings", "datagrid"]:
1408-
if (
1409-
"remote" in assets_metadata[log_filename]
1410-
and assets_metadata[log_filename]["remote"]
1411-
):
1412-
asset = assets_metadata[log_filename]
1413-
experiment.log_remote_asset(
1414-
uri=asset["link"],
1415-
remote_file_name=asset["fileName"],
1416-
step=asset["step"],
1417-
metadata=asset["metadata"],
1418-
)
1406+
if asset_data.get("remote", False):
1407+
if asset_type == "model-element":
1408+
dir_name = asset_data.get("dir", "")
1409+
if dir_name.startswith("models/"):
1410+
model_name = dir_name[len("models/"):]
1411+
else:
1412+
model_name = dir_name
1413+
raw_metadata = asset_data.get("metadata")
1414+
metadata = json.loads(raw_metadata) if raw_metadata else None
1415+
experiment.log_remote_model(
1416+
model_name=model_name,
1417+
uri=asset_data["link"],
1418+
metadata=metadata,
1419+
sync_mode=False,
1420+
)
1421+
else:
1422+
raw_metadata = asset_data.get("metadata")
1423+
metadata = json.loads(raw_metadata) if raw_metadata else None
1424+
experiment.log_remote_asset(
1425+
uri=asset_data["link"],
1426+
remote_file_name=asset_data["fileName"],
1427+
step=asset_data["step"],
1428+
metadata=metadata,
1429+
)
14191430
else:
14201431
self._log_asset(
14211432
experiment,
14221433
path,
14231434
asset_type,
1424-
log_filename,
1425-
assets_metadata,
1435+
asset_data,
14261436
asset_map,
14271437
)
14281438
# Process all nested assets:
1429-
for log_filename in assets_metadata:
1430-
asset_type = assets_metadata[log_filename].get("type", "asset") or "asset"
1439+
for asset_data in assets_metadata:
1440+
asset_type = asset_data.get("type", "asset") or "asset"
14311441
if asset_type in ["confusion-matrix", "embeddings", "datagrid"]:
14321442
self._log_asset(
14331443
experiment,
14341444
path,
14351445
asset_type,
1436-
log_filename,
1437-
assets_metadata,
1446+
asset_data,
14381447
asset_map,
14391448
)
14401449

@@ -1632,11 +1641,11 @@ def log_all(self, experiment, experiment_folder):
16321641
assets_metadata_filename = os.path.join(
16331642
experiment_folder, "assets", "assets_metadata.jsonl"
16341643
)
1635-
assets_metadata = {}
1644+
assets_metadata = []
16361645
if os.path.exists(assets_metadata_filename):
16371646
for line in open(assets_metadata_filename):
16381647
data = json.loads(line)
1639-
assets_metadata[data["fileName"]] = data
1648+
assets_metadata.append(data)
16401649

16411650
self.log_assets(
16421651
experiment,

cometx/framework/comet/download_manager.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1214,26 +1214,46 @@ def download_assets(self, experiment):
12141214
self.asset_type if self.asset_type else "all"
12151215
)
12161216
if len(assets) > 0:
1217+
filename_counts = {}
1218+
for asset in assets:
1219+
if asset["type"] == "audio" and asset["step"] is not None:
1220+
asset_filename = asset["fileName"]
1221+
asset["logAsFileName"] = asset_filename
1222+
if "." in asset_filename:
1223+
asset_filename, ext = asset_filename.split(".", 1)
1224+
else:
1225+
asset_filename, ext = asset_filename, ""
1226+
asset_filename = "%s-%s.%s" % (
1227+
asset_filename,
1228+
asset["step"],
1229+
ext,
1230+
)
1231+
asset["fileName"] = asset_filename
1232+
fn = asset["fileName"]
1233+
if fn in filename_counts:
1234+
filename_counts[fn] += 1
1235+
if "." in fn:
1236+
base, ext = fn.rsplit(".", 1)
1237+
asset["diskFileName"] = "%s_%d.%s" % (
1238+
base,
1239+
filename_counts[fn],
1240+
ext,
1241+
)
1242+
else:
1243+
asset["diskFileName"] = "%s_%d" % (
1244+
fn,
1245+
filename_counts[fn],
1246+
)
1247+
else:
1248+
filename_counts[fn] = 0
1249+
12171250
filename = "assets_metadata.jsonl"
12181251
filepath = os.path.join(assets_path, filename)
12191252
if self._should_write(filepath):
12201253
self.summary["assets"] += 1
12211254
os.makedirs(assets_path, exist_ok=True)
12221255
with open(filepath, "w") as f:
12231256
for asset in assets:
1224-
if asset["type"] == "audio" and asset["step"] is not None:
1225-
asset_filename = asset["fileName"]
1226-
asset["logAsFileName"] = asset_filename
1227-
if "." in asset_filename:
1228-
asset_filename, ext = asset_filename.split(".", 1)
1229-
else:
1230-
asset_filename, ext = asset_filename, ""
1231-
asset_filename = "%s-%s.%s" % (
1232-
asset_filename,
1233-
asset["step"],
1234-
ext,
1235-
)
1236-
asset["fileName"] = asset_filename
12371257
f.write(json.dumps(asset))
12381258
f.write("\n")
12391259

@@ -1244,9 +1264,10 @@ def download_assets(self, experiment):
12441264
path = assets_path
12451265
else:
12461266
path = os.path.join(assets_path, asset_type)
1247-
filename = sanitize_filename(asset["fileName"])
1267+
filename = sanitize_filename(
1268+
asset.get("diskFileName", asset["fileName"])
1269+
)
12481270
file_path = os.path.join(path, filename)
1249-
# Don't download a filename more than once:
12501271
if file_path not in filenames and self._should_write(file_path):
12511272
filenames.add(file_path)
12521273
self.summary["assets"] += 1

0 commit comments

Comments
 (0)