@@ -67,22 +67,24 @@ songs <- get_songs(url)
6767# Convert to dataframe
6868df_songs <- bind_rows(lapply(songs , as.data.frame.list ))
6969
70- # str(df_songs)
71-
70+ # Selecting only columns that matter
7271df_songs_filtered <- df_songs %> % select(id , title , title_short , isrc , duration , rank , type , artist.id , artist.name , artist.type , album.id , album.title , album.type )
7372
74- print(df_songs_filtered )
73+ # Review the results
74+ # print(df_songs_filtered)
7575
76+ # Cleaning artist name to lowercase and changing header name
7677dupSongs_df <- data.frame (tolower(df_songs_filtered $ artist.name ), stringsAsFactors = FALSE )
7778colnames(dupSongs_df ) <- " artist"
7879
80+ # Cleaning song title to lowercase
7981dupSongs_df $ title <- tolower(df_songs_filtered $ title )
8082
81- dupSongs_df $ duration <- tolower(df_songs_filtered $ duration )
83+ # Adding duration to DF
84+ dupSongs_df $ duration <- df_songs_filtered $ duration
8285
83- dupSongs_df <- rownames_to_column(dupSongs_df , var = " ID" )
84-
85- print(dupSongs_df )
86+ # Review the results
87+ # print(dupSongs_df)
8688
8789# ############# Using Levenshtein Similarity
8890
@@ -95,8 +97,8 @@ rpairsLeven <- epiWeights(rpairsLeven)
9597# Get pairs with a high probability of being duplicates
9698duplicatesLeven <- getPairs(rpairsLeven , min.weight = 0.79 , max.weight = 0.99 )
9799
98- # summary(epiClassify(rpairsLeven,0.6))
99100# Review the results
101+ # summary(epiClassify(rpairsLeven,0.6))
100102# print(duplicatesLeven)
101103
102104
@@ -128,7 +130,10 @@ subLV$source <- "Levenshtein"
128130# Appending all those into a dataframe, filtering by similarity of artist, title and duration
129131duplicatesTotal <- rbind(subJW , subLV )
130132
131- # Join back to the original dataset, to get track info and allow validation of duplicates
133+ # Adding an ID column, based on row number, to the original DF
134+ dupSongs_df <- rownames_to_column(dupSongs_df , var = " ID" )
135+
136+ # Join total duplicates found with the original dataset, to get track info and allow validation of duplicates
132137finalDupSongs <- merge(duplicatesTotal , dupSongs_df , by.x = " id1" , by.y = " ID" )
133138# colnames(finalDupSongs) <- c("id1","id2","artist_sim","title_sim","duration_sim","is_match","AvgWeight","source","artist_1","title_1","duration_1")
134139finalDupSongs <- merge(finalDupSongs , dupSongs_df , by.x = " id2" , by.y = " ID" )
0 commit comments