From d74f76cdc1bca719a761606472cd6e2ce08e10e0 Mon Sep 17 00:00:00 2001 From: Faisal N Date: Sun, 7 Dec 2025 06:59:00 +0000 Subject: [PATCH] Deleting the newest duplicates instead of the oldest (#177) --- dedup/__tests__/deduplicator.test.ts | 42 +++++++++++++++ dedup/deduplicator.ts | 81 ++++++++++++++++++++++------ dedup/spotifyApi.ts | 1 + 3 files changed, 109 insertions(+), 15 deletions(-) diff --git a/dedup/__tests__/deduplicator.test.ts b/dedup/__tests__/deduplicator.test.ts index 8b09cc0..f3a9554 100644 --- a/dedup/__tests__/deduplicator.test.ts +++ b/dedup/__tests__/deduplicator.test.ts @@ -22,6 +22,34 @@ describe('BaseDeduplicator', () => { expect(duplicates[0].index).toBe(2); }); + it('should keep the oldest duplicate by added_at timestamp', () => { + const tracks = [ + createTrack({ id: '1', name: 'Track 1', duration_ms: 1000, added_at: '2020-01-01T00:00:00Z' }), + createTrack({ id: '2', name: 'Track 2', duration_ms: 2000, added_at: '2020-01-02T00:00:00Z' }), + createTrack({ id: '1', name: 'Track 1', duration_ms: 1000, added_at: '2020-01-05T00:00:00Z' }), // Newer duplicate + ]; + + const duplicates = PlaylistDeduplicator.findDuplicatedTracks(tracks); + + expect(duplicates).toHaveLength(1); + expect(duplicates[0].reason).toBe('same-id'); + expect(duplicates[0].index).toBe(2); // Newer one is marked as duplicate + }); + + it('should mark older duplicate for removal if newer one is encountered', () => { + const tracks = [ + createTrack({ id: '1', name: 'Track 1', duration_ms: 1000, added_at: '2020-01-05T00:00:00Z' }), + createTrack({ id: '2', name: 'Track 2', duration_ms: 2000, added_at: '2020-01-02T00:00:00Z' }), + createTrack({ id: '1', name: 'Track 1', duration_ms: 1000, added_at: '2020-01-01T00:00:00Z' }), // Older duplicate + ]; + + const duplicates = PlaylistDeduplicator.findDuplicatedTracks(tracks); + + expect(duplicates).toHaveLength(1); + expect(duplicates[0].reason).toBe('same-id'); + expect(duplicates[0].index).toBe(0); // Older one is marked 
as duplicate + }); + it('should identify duplicates with same name and artist', () => { const tracks = [ createTrack({ id: '1', name: 'Track 1', duration_ms: 1000 }), @@ -34,6 +62,20 @@ describe('BaseDeduplicator', () => { expect(duplicates[0].reason).toBe('same-name-artist'); }); + it('should keep oldest by timestamp for same-name-artist duplicates', () => { + const tracks = [ + createTrack({ id: '1', name: 'Track 1', duration_ms: 1000, added_at: '2020-01-05T00:00:00Z' }), + createTrack({ id: '2', name: 'Track 1', duration_ms: 1001, added_at: '2020-01-01T00:00:00Z' }), // Older + createTrack({ id: '3', name: 'Track 1', duration_ms: 1002, added_at: '2020-01-10T00:00:00Z' }), // Newest + ]; + + const duplicates = PlaylistDeduplicator.findDuplicatedTracks(tracks); + + expect(duplicates).toHaveLength(2); + expect(duplicates.map(d => d.index).sort()).toEqual([0, 2]); + expect(duplicates.every(d => d.reason === 'same-name-artist')).toBe(true); + }); + it('should ignore null tracks', () => { const tracks = [ createTrack({ id: '1' }), diff --git a/dedup/deduplicator.ts b/dedup/deduplicator.ts index 2ac9829..0de75a5 100644 --- a/dedup/deduplicator.ts +++ b/dedup/deduplicator.ts @@ -13,8 +13,8 @@ class BaseDeduplicator { } static findDuplicatedTracks(tracks: Array) { - const seenIds: { [key: string]: boolean } = {}; - const seenNameAndArtist: { [key: string]: Array } = {}; + const seenIds: { [key: string]: { index: number; added_at?: string } } = {}; + const seenNameAndArtist: { [key: string]: Array<{ index: number; duration: number; added_at?: string }> } = {}; let duplicates: Array = []; const result = tracks.reduce((duplicates, track, index) => { if (track === null) return duplicates; @@ -25,31 +25,78 @@ class BaseDeduplicator { if (track.id in seenIds) { // if the two tracks have the same Spotify ID, they are duplicates reasonDuplicate = 'same-id'; + // Compare added_at timestamps. 
If current track is older, mark the previous one as duplicate instead + const previousEntry = seenIds[track.id]; + if (track.added_at && previousEntry.added_at && track.added_at < previousEntry.added_at) { + // Current track is older, so remove the previous occurrence instead + // Remove the duplicate entry for the previous index and add it for the new one + duplicates = duplicates.filter(d => d.index !== previousEntry.index); + seenIds[track.id] = { index: index, added_at: track.added_at }; + duplicates.push({ + index: previousEntry.index, + track: tracks[previousEntry.index], + reason: reasonDuplicate, + }); + } else { + // Current track is newer, mark it as duplicate + duplicates.push({ + index: index, + track: track, + reason: reasonDuplicate, + }); + } } else { // if they have the same name, main artist, and roughly same duration - // we consider tem duplicates too + // we consider them duplicates too if (seenNameAndArtistKey in seenNameAndArtist) { // we check if _any_ of the previous durations is similar to the one we are checking if ( seenNameAndArtist[seenNameAndArtistKey].filter( - (duration) => Math.abs(duration - track.duration_ms) < 2000 + (duration) => Math.abs(duration.duration - track.duration_ms) < 2000 ).length !== 0 ) { reasonDuplicate = 'same-name-artist'; + // Find the oldest entry and keep that one + const previousEntries = seenNameAndArtist[seenNameAndArtistKey]; + const sortedEntries = [...previousEntries, { index, duration: track.duration_ms, added_at: track.added_at }] + .sort((a, b) => { + // Sort by added_at in ascending order (oldest first) + if (a.added_at && b.added_at) { + return a.added_at.localeCompare(b.added_at); + } + // If no timestamp, fall back to index order + return a.index - b.index; + }); + + // Keep the oldest, mark the rest as duplicates + const oldestIndex = sortedEntries[0].index; + const newerIndices = sortedEntries.slice(1).map(e => e.index); + + // Remove any duplicates we previously marked for this group + 
duplicates = duplicates.filter(d => { + const prevEntry = previousEntries.find(e => e.index === d.index); + return !prevEntry; + }); + + // Add duplicates for all indices except the oldest + newerIndices.forEach(newIndex => { + duplicates.push({ + index: newIndex, + track: tracks[newIndex], + reason: reasonDuplicate, + }); + }); + + // Update seenNameAndArtist to only track the oldest + seenNameAndArtist[seenNameAndArtistKey] = sortedEntries; } } } - if (reasonDuplicate !== null) { - duplicates.push({ - index: index, - track: track, - reason: reasonDuplicate, - }); - } else { - seenIds[track.id] = true; + if (reasonDuplicate === null) { + seenIds[track.id] = { index: index, added_at: track.added_at }; seenNameAndArtist[seenNameAndArtistKey] = seenNameAndArtist[seenNameAndArtistKey] || []; - seenNameAndArtist[seenNameAndArtistKey].push(track.duration_ms); + seenNameAndArtist[seenNameAndArtistKey].push({ index: index, duration: track.duration_ms, added_at: track.added_at }); } return duplicates; }, duplicates); @@ -82,7 +129,9 @@ export class PlaylistDeduplicator extends BaseDeduplicator { pages.forEach((page) => { page.items.forEach((item: SpotifyPlaylistTrack) => { if (item?.track) { - tracks.push(item.track); + // Preserve the added_at timestamp from the playlist item + const trackWithTimestamp = { ...item.track, added_at: item.added_at }; + tracks.push(trackWithTimestamp); } }); }); @@ -166,7 +215,9 @@ export class SavedTracksDeduplicator extends BaseDeduplicator { pages.forEach((page) => { page.items.forEach((item: SpotifySavedTrack) => { if (item?.track) { - tracks.push(item.track); + // Preserve the added_at timestamp from the saved track wrapper + const trackWithTimestamp = { ...item.track, added_at: item.added_at }; + tracks.push(trackWithTimestamp); } }); }); diff --git a/dedup/spotifyApi.ts b/dedup/spotifyApi.ts index 64242c7..c6b76a1 100644 --- a/dedup/spotifyApi.ts +++ b/dedup/spotifyApi.ts @@ -12,6 +12,7 @@ export interface SpotifyTrack { linked_from?: 
SpotifyTrack; name: string; uri: string; + added_at?: string; } export interface SpotifyPlaylist {