Skip to content

Commit 893f136

Browse files
authored
Update DeezerAnalysisAI.R
Fixed issue with ID column
1 parent b748ac0 commit 893f136

1 file changed

Lines changed: 14 additions & 9 deletions

File tree

DeezerAnalysisAI.R

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,22 +67,24 @@ songs <- get_songs(url)
6767
# Convert to dataframe
6868
df_songs <- bind_rows(lapply(songs, as.data.frame.list))
6969

70-
# str(df_songs)
71-
70+
# Selecting only columns that matter
7271
df_songs_filtered <- df_songs %>% select(id, title, title_short, isrc, duration, rank, type, artist.id, artist.name, artist.type, album.id, album.title, album.type)
7372

74-
print(df_songs_filtered)
73+
# Review the results
74+
# print(df_songs_filtered)
7575

76+
# Cleaning artist name to lowercase and changing header name
7677
dupSongs_df <- data.frame(tolower(df_songs_filtered$artist.name), stringsAsFactors = FALSE)
7778
colnames(dupSongs_df) <- "artist"
7879

80+
# Cleaning song title to lowercase
7981
dupSongs_df$title <- tolower(df_songs_filtered$title)
8082

81-
dupSongs_df$duration <- tolower(df_songs_filtered$duration)
83+
# Adding duration to DF
84+
dupSongs_df$duration <- df_songs_filtered$duration
8285

83-
dupSongs_df <- rownames_to_column(dupSongs_df, var = "ID")
84-
85-
print(dupSongs_df)
86+
# Review the results
87+
# print(dupSongs_df)
8688

8789
############## Using Levenshtein Similarity
8890

@@ -95,8 +97,8 @@ rpairsLeven <- epiWeights(rpairsLeven)
9597
# Get pairs with a high probability of being duplicates
9698
duplicatesLeven <- getPairs(rpairsLeven, min.weight=0.79, max.weight=0.99)
9799

98-
# summary(epiClassify(rpairsLeven,0.6))
99100
# Review the results
101+
# summary(epiClassify(rpairsLeven,0.6))
100102
# print(duplicatesLeven)
101103

102104

@@ -128,7 +130,10 @@ subLV$source <- "Levenshtein"
128130
# Appending all those into a dataframe, filtering by similarity of artist, title and duration
129131
duplicatesTotal <- rbind(subJW, subLV)
130132

131-
# Join back to the original dataset, to get track info and allow validation of duplicates
133+
# Adding an ID column, based on row number, to the original DF
134+
dupSongs_df <- rownames_to_column(dupSongs_df, var = "ID")
135+
136+
# Join total duplicates found with the original dataset, to get track info and allow validation of duplicates
132137
finalDupSongs <- merge(duplicatesTotal, dupSongs_df, by.x = "id1", by.y = "ID")
133138
# colnames(finalDupSongs) <- c("id1","id2","artist_sim","title_sim","duration_sim","is_match","AvgWeight","source","artist_1","title_1","duration_1")
134139
finalDupSongs <- merge(finalDupSongs, dupSongs_df, by.x = "id2", by.y = "ID")

0 commit comments

Comments
 (0)