#!/usr/bin/env python3
"""
Extract Spotify URLs for Top 500 Albums

This script searches for each album on Spotify using the Spotify Web API
and creates a mapping file similar to the Wikipedia URLs.

Note: You'll need to set up a Spotify app at https://developer.spotify.com/
and get your client credentials.
"""

import base64
import csv
import json
import time
import urllib.parse
import urllib.request
from typing import Dict, List, Optional

# Spotify API credentials (you'll need to get these from Spotify Developer Dashboard)
CLIENT_ID = "your_client_id_here"
CLIENT_SECRET = "your_client_secret_here"

# Upper bound (seconds) on any single HTTP request so that one stalled
# connection cannot hang the whole run.
_REQUEST_TIMEOUT_SECONDS = 30


def get_spotify_access_token() -> Optional[str]:
    """Get access token from Spotify API.

    Posts ``grant_type=client_credentials`` to the Spotify token endpoint,
    authenticating with HTTP Basic auth built from the module-level
    ``CLIENT_ID`` and ``CLIENT_SECRET``.

    Returns:
        The ``access_token`` field of the JSON response, or ``None`` when the
        request fails or the response carries no such field. Failures are
        printed rather than raised.
    """
    auth_url = "https://accounts.spotify.com/api/token"

    # Encode credentials as base64("<id>:<secret>") for the Basic auth header.
    credentials = f"{CLIENT_ID}:{CLIENT_SECRET}"
    encoded_credentials = base64.b64encode(credentials.encode()).decode()

    headers = {
        'Authorization': f'Basic {encoded_credentials}',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    data = 'grant_type=client_credentials'

    try:
        request = urllib.request.Request(auth_url, data=data.encode(), headers=headers)
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT_SECONDS) as response:
            result = json.loads(response.read().decode())
        return result.get('access_token')
    except Exception as e:
        # Best-effort boundary: the caller only distinguishes token / no token.
        print(f"Error getting access token: {e}")
        return None


def _build_search_queries(artist: str, album: str) -> List[str]:
    """Return the search queries to try for one album, most specific first."""
    # Clean up artist and album names for search
    search_artist = artist.replace("&", "and").strip()
    search_album = album.replace("&", "and").strip()

    # Also try the artist name with a leading "The " toggled (dropped when
    # present, added when absent).
    if search_artist.startswith("The "):
        search_artist_alt = search_artist[4:]
    else:
        search_artist_alt = f"The {search_artist}"

    return [
        f'album:"{search_album}" artist:"{search_artist}"',
        f'album:"{search_album}" artist:"{search_artist_alt}"',
        f'"{search_album}" "{search_artist}"',
        f'"{search_album}" "{search_artist_alt}"',
        f'{search_album} {search_artist}',
    ]


def _is_close_match(artist: str, album: str, spotify_album: dict) -> bool:
    """Return True when a search result's title and first artist match.

    A match means one string contains the other, case-insensitively, for both
    the album title and the first listed artist. A result missing ``name`` or
    ``artists`` raises ``KeyError``/``IndexError``; the caller handles that.
    """
    wanted_album = album.lower()
    wanted_artist = artist.lower()
    spotify_name = spotify_album['name'].lower()
    spotify_artist = spotify_album['artists'][0]['name'].lower()

    album_matches = wanted_album in spotify_name or spotify_name in wanted_album
    artist_matches = wanted_artist in spotify_artist or spotify_artist in wanted_artist
    return album_matches and artist_matches


def search_spotify_album(artist: str, album: str, access_token: str) -> Optional[str]:
    """Search for album on Spotify and return the album URL.

    Tries several query forms in turn. For the first query that yields any
    results, returns the URL of the first close match, or of the first result
    when none is a close match.

    Args:
        artist: Artist name as listed in the source data.
        album: Album title as listed in the source data.
        access_token: Bearer token for the Spotify Web API.

    Returns:
        The album's ``external_urls['spotify']`` value, or ``None`` when every
        query returned nothing or failed. Errors are printed, not raised.
    """
    headers = {
        'Authorization': f'Bearer {access_token}'
    }

    # Try different search strategies
    for query in _build_search_queries(artist, album):
        try:
            encoded_query = urllib.parse.quote(query)
            search_url = f"https://api.spotify.com/v1/search?q={encoded_query}&type=album&limit=10"

            request = urllib.request.Request(search_url, headers=headers)
            with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT_SECONDS) as response:
                data = json.loads(response.read().decode())

            albums = data.get('albums', {}).get('items', [])

            if albums:
                # Look for the best match
                for spotify_album in albums:
                    if _is_close_match(artist, album, spotify_album):
                        return spotify_album['external_urls']['spotify']

                # If no perfect match, return the first result
                return albums[0]['external_urls']['spotify']

            # Rate limiting before trying the next query form
            time.sleep(0.1)

        except Exception as e:
            # Best-effort: report, back off, and fall through to the next
            # query form instead of aborting the whole run on one bad request.
            print(f"Error searching for {artist} - {album}: {e}")
            time.sleep(1)
            continue

    return None


def main() -> None:
    """Main function to extract Spotify URLs.

    Reads ``top_500_albums_2023.csv`` (columns ``Artist`` and ``Album``),
    searches Spotify for each row, and writes:

    * ``spotify_urls_mapping.json`` - album title -> Spotify URL
    * ``spotify_urls_mapping_progress.json`` - snapshot every 50 albums
    * ``failed_spotify_searches.txt`` - albums with no result, if any
    """
    print("Spotify URL Extractor for Top 500 Albums")
    print("=" * 50)

    # Check if credentials are set
    if CLIENT_ID == "your_client_id_here" or CLIENT_SECRET == "your_client_secret_here":
        print("ERROR: Please set your Spotify API credentials in the script!")
        print("1. Go to https://developer.spotify.com/")
        print("2. Create an app and get your Client ID and Client Secret")
        print("3. Replace CLIENT_ID and CLIENT_SECRET in this script")
        return

    # Get access token
    print("Getting Spotify access token...")
    access_token = get_spotify_access_token()
    if not access_token:
        print("Failed to get access token. Check your credentials.")
        return

    print("Access token obtained successfully!")

    # Read the albums data
    albums = []
    try:
        with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            albums = list(reader)
    except FileNotFoundError:
        print("Error: top_500_albums_2023.csv not found!")
        return

    print(f"Found {len(albums)} albums to process")

    # NOTE(review): the mapping is keyed by album title only, so two albums
    # sharing a title overwrite each other. Kept as-is because consumers of
    # the JSON file presumably rely on this shape - confirm.
    spotify_mappings: Dict[str, str] = {}
    failed_albums = []
    # Counted separately from len(spotify_mappings) so the final statistics
    # stay correct even when titles collide in the mapping.
    found_count = 0

    for i, album in enumerate(albums, 1):
        artist = album['Artist']
        album_name = album['Album']

        print(f"[{i}/{len(albums)}] Searching: {artist} - {album_name}")

        spotify_url = search_spotify_album(artist, album_name, access_token)

        if spotify_url:
            spotify_mappings[album_name] = spotify_url
            found_count += 1
            print(f" ✓ Found: {spotify_url}")
        else:
            failed_albums.append((artist, album_name))
            print(" ✗ Not found")

        # Rate limiting to be respectful to Spotify API
        time.sleep(0.2)

        # Save progress every 50 albums
        if i % 50 == 0:
            with open('spotify_urls_mapping_progress.json', 'w', encoding='utf-8') as f:
                json.dump(spotify_mappings, f, indent=2, ensure_ascii=False)
            print(f"Progress saved. Found {len(spotify_mappings)} URLs so far.")

    # Save the final mappings
    with open('spotify_urls_mapping.json', 'w', encoding='utf-8') as f:
        json.dump(spotify_mappings, f, indent=2, ensure_ascii=False)

    # Save failed albums for manual review
    if failed_albums:
        with open('failed_spotify_searches.txt', 'w', encoding='utf-8') as f:
            f.write("Albums not found on Spotify:\n")
            f.write("=" * 40 + "\n")
            for failed_artist, failed_album in failed_albums:
                f.write(f"{failed_artist} - {failed_album}\n")

    # A CSV with a header but no rows would otherwise divide by zero here.
    success_rate = found_count / len(albums) * 100 if albums else 0.0

    print("\nCompleted!")
    print(f"Successfully found Spotify URLs for {found_count} albums")
    print(f"Failed to find {len(failed_albums)} albums")
    print(f"Success rate: {success_rate:.1f}%")
    print("\nResults saved to:")
    print(" - spotify_urls_mapping.json")
    if failed_albums:
        print(" - failed_spotify_searches.txt")


if __name__ == "__main__":
    main()