# Packages ----
# spotifyr is installed from GitHub:
#   devtools::install_github('charlie86/spotifyr')
library(spotifyr)      # get_artist_audio_features()
library(tidyverse)     # dplyr, tidyr, readr, stringr, ggplot2
library(magrittr)      # %<>% assignment pipe
library(ggridges)      # geom_density_ridges()
library(ggcorrplot)    # cor_pmat(), ggcorrplot()
library(viridisLite)   # inferno()
library(factoextra)    # get_dist(), fviz_*(), hcut()
library(ggiraphExtra)  # ggRadar()
# Load data ----
# The features can be pulled from the Spotify API with:
#   df <- get_artist_audio_features(artist = "bruce springsteen")
# Here a CSV hosted on GitHub is read instead.
df <- read_csv(
  file = "https://raw.github.com/peerchristensen/Springsteen_album_clusters/master/springsteen_albums.csv"
)

# Print the column names, types and first values.
glimpse(df)
## Observations: 537 ## Variables: 31 ## $ artist_name <chr> "Bruce Springsteen", "Bruce Springsteen... ## $ artist_uri <chr> "3eqjTLE0HfPfh78zjh6TqT", "3eqjTLE0HfPf... ## $ album_uri <chr> "0PMasrHdpaoIRuHuhHp72O", "0PMasrHdpaoI... ## $ album_name <chr> "Born In The U.S.A.", "Born In The U.S.... ## $ album_img <chr> "https://i.scdn.co/image/d002b63ceb5658... ## $ album_type <chr> "album", "album", "album", "album", "al... ## $ is_collaboration <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS... ## $ album_release_date <chr> "1984-06-04", "1984-06-04", "1984-06-04... ## $ album_release_year <date> 1984-06-04, 1984-06-04, 1984-06-04, 19... ## $ album_popularity <dbl> 76, 76, 76, 76, 76, 76, 76, 76, 76, 76,... ## $ track_name <chr> "Born in the U.S.A.", "Cover Me", "Darl... ## $ track_uri <chr> "0dOg1ySSI7NkpAe89Zo0b9", "4U7NhC2rQTAh... ## $ track_number <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ... ## $ disc_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... ## $ danceability <dbl> 0.398, 0.535, 0.536, 0.429, 0.544, 0.62... ## $ energy <dbl> 0.952, 0.884, 0.982, 0.949, 0.762, 0.44... ## $ key <chr> "E", "A", "G", "C", "A#", "C#", "F", "A... ## $ loudness <dbl> -6.042, -5.499, -4.674, -5.295, -7.289,... ## $ mode <chr> "major", "minor", "major", "major", "ma... ## $ speechiness <dbl> 0.0610, 0.0407, 0.0389, 0.0458, 0.0382,... ## $ acousticness <dbl> 0.000373, 0.001880, 0.014100, 0.084200,... ## $ instrumentalness <dbl> 7.75e-05, 1.26e-03, 3.67e-05, 0.00e+00,... ## $ liveness <dbl> 0.1000, 0.1400, 0.2740, 0.1540, 0.0740,... ## $ valence <dbl> 0.584, 0.796, 0.963, 0.967, 0.473, 0.86... ## $ tempo <dbl> 122.093, 120.555, 119.201, 184.286, 120... ## $ duration_ms <dbl> 278680, 205987, 288027, 192267, 215427,... ## $ time_signature <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ... ## $ key_mode <chr> "E major", "A minor", "G major", "C maj... ## $ track_popularity <dbl> 72, 51, 45, 47, 49, 71, 50, 47, 53, 62,... 
## $ track_preview_url <chr> "https://p.scdn.co/mp3-preview/3b6a5b91... ## $ track_open_spotify_url <chr> "https://open.spotify.com/track/0dOg1yS...
# Clean album list ----
# Some albums only have one song, some are alternate versions.
remove_albums <- c(
  "Greatest Hits",
  "Hammersmith Odeon, London 75",
  "The Essential Bruce Springsteen (Bonus Disc)",
  "The Ties That Bind: The River Collection",
  "Chapter and Verse",
  "The Promise",
  "Tracks"
)

# Drop the listed albums and anything with "live"/"Live" in its name (matched
# against the original capitalisation), then title-case the names and cut
# everything from the first colon onwards.
df <- df %>%
  filter(
    !album_name %in% remove_albums,
    !grepl("live|Live", album_name)
  ) %>%
  mutate(album_name = gsub(":.*", "", str_to_title(album_name)))

# Shorten three long titles; the first matching pattern wins.
df <- df %>%
  mutate(
    album_name = case_when(
      grepl("Innocent", album_name) ~ "The Wild, The Innocent..",
      grepl("Greetings", album_name) ~ "Greetings",
      grepl("Darkness", album_name) ~ "Darkness",
      TRUE ~ album_name
    )
  )
# Five most common keys ----
# Horizontal bar chart of the most frequent key/mode combinations.
df %>%
  count(key_mode, sort = TRUE) %>%
  # Name the weight column explicitly so top_n() does not have to guess it
  # (and print a message). Ties are kept, so more than five rows can remain.
  top_n(5, wt = n) %>%
  # Rows are already sorted by count; keep that rank for ordering the bars.
  mutate(ordered = row_number()) %>%
  ggplot(aes(x = reorder(key_mode, desc(ordered)), y = n, fill = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Five most common keys") +
  # guide = "none" replaces the deprecated guide = F.
  scale_fill_viridis_c(option = "B", direction = -1, guide = "none") +
  theme_minimal() +
  labs(y = "n", x = "key")
正如我们所看到的，spotifyr 从 Spotify API 获取了许多有趣的数据。让我们先来看看每张专辑的可舞性（danceability）。《Born to Run》的可舞性最低，而《Tunnel of Love》的可舞性最高。
# Danceability per album ----
# One ridge per album, ordered by release date. The original group_by() before
# ggplot() had no effect on the plot and is dropped.
df %>%
  ggplot(aes(
    x = danceability,
    y = reorder(album_name, desc(album_release_year)),
    fill = reorder(album_name, desc(album_release_year))
  )) +
  geom_density_ridges(colour = "snow") +
  # guide = "none" replaces the deprecated guide = F.
  scale_fill_viridis_d(
    option = "B",
    begin = .05,
    direction = -1,
    guide = "none"
  ) +
  theme_minimal() +
  ggtitle("Danceability") +
  labs(y = "album")
# Six features per album ----
# Reshape the six audio features to long format and draw one ridge plot per
# feature, with albums ordered by release date.
df %>%
  pivot_longer(
    cols = c(danceability, energy, loudness, valence, tempo, acousticness),
    names_to = "feature",
    values_to = "measure"
  ) %>%
  ggplot(aes(
    x = measure,
    y = reorder(album_name, desc(album_release_year)),
    fill = album_release_date
  )) +
  # The original passed `legend = F`, which is not a geom_density_ridges()
  # parameter and was ignored with a warning; the legend is hidden by guides().
  geom_density_ridges(
    rel_min_height = 0.005,
    alpha = .9,
    size = .2,
    colour = "snow"
  ) +
  facet_wrap(~feature, scales = "free", ncol = 2) +
  scale_fill_viridis_d(option = "B", begin = .05) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 7)) +
  labs(y = "album name") +
  ggtitle(
    "Springsteen albums in six features",
    subtitle = "Acousticness, danceability, energy, loudness, tempo and valence"
  ) +
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")
# Feature correlations ----
# P-values for the pairwise correlations between the six features.
sign_test <- cor_pmat(
  select(df, acousticness, danceability, energy, loudness, tempo, valence)
)

# Lower-triangle correlation plot; sign_test is passed as the p-value matrix.
ggcorrplot(
  cor(select(df, acousticness, danceability, energy, loudness, tempo, valence)),
  type = "lower",
  p.mat = sign_test,
  colors = c(inferno(5)[2], "snow", inferno(5)[4])
) +
  ggtitle(
    "Correlations between features",
    subtitle = "Non-significant correlations marked with X"
  )
# Album-level feature table and distance matrix ----
# dfScale holds one row per album (album name as row name) with the mean
# standardised value of each of the six features.
#
# The features are standardised across ALL tracks first and averaged per album
# afterwards. Calling scale() inside the grouped summarise() standardises
# within each album, so every album mean is zero up to rounding error and the
# distances and clusters computed from dfScale would be based on noise.
dfScale <- df %>%
  select(
    album_name, acousticness, danceability, energy, loudness, tempo, valence
  ) %>%
  # scale() returns a one-column matrix; as.numeric() keeps plain vectors.
  mutate(across(-album_name, ~ as.numeric(scale(.x)))) %>%
  group_by(album_name) %>%
  summarise(across(everything(), mean)) %>%
  data.frame()

# Move album names into the row names so only numeric columns remain.
row.names(dfScale) <- dfScale$album_name
dfScale$album_name <- NULL

# Pairwise distances between albums, with columns standardised by get_dist().
df_dist <- get_dist(dfScale, stand = TRUE)

fviz_dist(
  df_dist,
  gradient = list(low = inferno(5)[2], mid = "white", high = inferno(5)[4])
) +
  theme_minimal() +
  ggtitle(
    "Distance matrix",
    subtitle = "Similarity between albums based on all features"
  ) +
  theme(
    axis.text.x = element_text(hjust = 1, angle = 45),
    axis.title = element_blank()
  )
# Radar chart per album ----
# One facet per album showing its six feature means from dfScale.
dfScale %>%
  mutate(albums = row.names(dfScale)) %>%
  ggRadar(
    aes(group = albums),
    rescale = FALSE,
    legend.position = "none",
    size = 1,
    interactive = FALSE,
    use.label = TRUE
  ) +
  facet_wrap(~albums) +
  scale_y_discrete(breaks = NULL) +
  # The complete theme must come first: adding theme_minimal() after theme()
  # overwrites the axis text size set there.
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 10),
    legend.position = "none"
  ) +
  scale_fill_viridis_d(option = "B") +
  scale_colour_viridis_d(option = "B")
# Number of clusters: hierarchical ----
# No `method` is given, so fviz_nbclust() uses its default criterion.
fviz_nbclust(x = dfScale, FUNcluster = hcut) +
  labs(title = "Optimal Number of Clusters: H-Clustering")
# Hierarchical clustering ----
# Cluster the albums on the column-standardised feature table.
df.hc <- hclust(dist(scale(dfScale)))

# Dendrogram cut into k = 3 groups. The original supplied only two colours for
# three clusters; three are given here, using the same inferno(10) picks as the
# k-means plot.
# NOTE(review): if two clusters were intended, set k = 2 instead - confirm.
fviz_dend(
  df.hc,
  k = 3,
  cex = .9,
  k_colors = inferno(10)[c(4, 6, 8)],
  color_labels_by_k = TRUE,
  rect = TRUE
) +
  # Title spelling corrected ("Hierachical" -> "Hierarchical").
  ggtitle("Hierarchical Clustering")
# Number of clusters: k-means ----
# Same diagnostic as for hierarchical clustering, with kmeans as the
# clustering function.
fviz_nbclust(x = dfScale, FUNcluster = kmeans) +
  labs(title = "Optimal Number of Clusters: K-means Clustering")
# K-means clustering ----
# Fix the RNG seed so the random starts give the same result on every run.
set.seed(324789)

# Two clusters, best of 25 random starts.
km.res <- kmeans(dfScale, centers = 2, nstart = 25)

# Plot the clusters with convex hulls and non-overlapping labels.
fviz_cluster(
  km.res,
  data = dfScale,
  ellipse.type = "convex",
  repel = TRUE,  # TRUE, not the reassignable shorthand T
  palette = inferno(10)[c(4, 6, 8)],
  ggtheme = theme_minimal(),
  main = "K-means Clustering"
)
作者:Peer Christensen 原文链接: https://peerchristensen.netlify.com/post/clustering-springsteen-albums-with-spotifyr/
