# top500albums/extract_spotify_urls.py

#!/usr/bin/env python3
"""
Extract Spotify URLs for Top 500 Albums

This script searches for each album on Spotify using the Spotify Web API
and creates a mapping file similar to the Wikipedia URLs.

Note: You'll need to set up a Spotify app at https://developer.spotify.com/
and get your client credentials.
"""

import base64
import csv
import json
import os
import time
import urllib.parse
import urllib.request
from typing import Dict, Optional

# Spotify API credentials, obtained from the Spotify Developer Dashboard
# (https://developer.spotify.com/). They are read from the environment so that
# secrets never live in source control:
#
#   export SPOTIFY_CLIENT_ID=...
#   export SPOTIFY_CLIENT_SECRET=...
#
# The placeholder defaults are the sentinel values main() checks for before
# making any request. Any credentials that were previously hard-coded in this
# file should be treated as compromised and rotated in the dashboard.
CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID", "your_client_id_here")
CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET", "your_client_secret_here")

def get_spotify_access_token(timeout: float = 10.0) -> Optional[str]:
    """Request an access token from the Spotify accounts service.

    Sends a ``grant_type=client_credentials`` request authenticated with the
    module-level ``CLIENT_ID`` and ``CLIENT_SECRET`` via HTTP Basic auth.

    Args:
        timeout: Seconds to wait for the token endpoint before giving up, so
            a stalled connection cannot hang the script indefinitely.

    Returns:
        The ``access_token`` string from the JSON response, or ``None`` if
        the request failed or the response did not contain a token. Failures
        are reported on stdout rather than raised.
    """
    auth_url = "https://accounts.spotify.com/api/token"

    # HTTP Basic auth value: base64("client_id:client_secret").
    credentials = f"{CLIENT_ID}:{CLIENT_SECRET}"
    encoded_credentials = base64.b64encode(credentials.encode()).decode()

    headers = {
        'Authorization': f'Basic {encoded_credentials}',
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    # Let the stdlib build the form body instead of hand-writing it.
    body = urllib.parse.urlencode({'grant_type': 'client_credentials'}).encode()

    try:
        request = urllib.request.Request(auth_url, data=body, headers=headers)
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            result = json.loads(response.read().decode())
        return result.get('access_token')
    except Exception as e:
        # Best-effort boundary: the caller only needs to know "no token".
        print(f"Error getting access token: {e}")
        return None

def _build_search_queries(artist: str, album: str) -> list:
    """Return Spotify search query strings for an album, most specific first."""
    search_artist = artist.replace("&", "and").strip()
    search_album = album.replace("&", "and").strip()

    # Also try the artist with a leading "The " toggled (removed if present,
    # added if absent), since that prefix might confuse the search.
    if search_artist.startswith("The "):
        search_artist_alt = search_artist[4:]
    else:
        search_artist_alt = f"The {search_artist}"

    return [
        f'album:"{search_album}" artist:"{search_artist}"',
        f'album:"{search_album}" artist:"{search_artist_alt}"',
        f'"{search_album}" "{search_artist}"',
        f'"{search_album}" "{search_artist_alt}"',
        f'{search_album} {search_artist}',
    ]


def _pick_album_url(items, artist: str, album: str) -> Optional[str]:
    """Return the URL of the best candidate in *items*, or None if there is none."""
    wanted_album = album.lower()
    wanted_artist = artist.lower()
    # Drop null/empty entries so one bad item cannot abort the whole scan.
    candidates = [item for item in (items or []) if item]

    for candidate in candidates:
        name = (candidate.get('name') or '').lower()
        artists = candidate.get('artists') or []
        first_artist = (artists[0].get('name') or '').lower() if artists else ''
        if not name or not first_artist:
            # An empty string is a substring of everything; never let it
            # count as a match.
            continue

        # Loose match: either string may contain the other (handles
        # "Deluxe Edition" suffixes, "The" prefixes, etc.).
        album_matches = wanted_album in name or name in wanted_album
        artist_matches = wanted_artist in first_artist or first_artist in wanted_artist
        if album_matches and artist_matches:
            return candidate['external_urls']['spotify']

    # No close match: fall back to the first result of this query.
    if candidates:
        return candidates[0]['external_urls']['spotify']
    return None


def search_spotify_album(artist: str, album: str, access_token: str,
                         timeout: float = 10.0) -> Optional[str]:
    """Search Spotify for an album and return its URL.

    Tries several query formulations in turn (field-qualified, quoted, plain)
    and returns as soon as one of them yields any result. Within a result
    page, an album whose name and first artist loosely match the inputs is
    preferred; otherwise the first result is returned.

    Args:
        artist: Artist name as listed in the source data.
        album: Album title as listed in the source data.
        access_token: Bearer token for the Spotify Web API.
        timeout: Seconds to wait for each search request.

    Returns:
        The album's ``external_urls.spotify`` URL, or ``None`` if every query
        failed or returned no results. Request errors are printed and the
        next query is tried.
    """
    # Same header for every request; build it once.
    headers = {
        'Authorization': f'Bearer {access_token}'
    }

    for query in _build_search_queries(artist, album):
        try:
            encoded_query = urllib.parse.quote(query)
            search_url = f"https://api.spotify.com/v1/search?q={encoded_query}&type=album&limit=10"

            request = urllib.request.Request(search_url, headers=headers)
            # Close each response promptly; up to five requests are made per
            # album.
            with urllib.request.urlopen(request, timeout=timeout) as response:
                data = json.loads(response.read().decode())

            items = data.get('albums', {}).get('items', [])
            url = _pick_album_url(items, artist, album)
            if url:
                return url

            # Rate limiting between consecutive queries for the same album
            time.sleep(0.1)

        except Exception as e:
            # Best effort: report, back off briefly, and try the next query.
            print(f"Error searching for {artist} - {album}: {e}")
            time.sleep(1)
            continue

    return None

def main():
    """Look up a Spotify URL for every album in the CSV and save the results.

    Reads ``top_500_albums_2023.csv`` (columns ``Artist`` and ``Album``) from
    the current directory, searches Spotify for each row, and writes:

    * ``spotify_urls_mapping.json`` - album title -> Spotify URL
    * ``spotify_urls_mapping_progress.json`` - snapshot every 50 albums
    * ``failed_spotify_searches.txt`` - albums with no result (only if any)

    Returns early, after printing an explanation, if credentials are still
    placeholders, no access token can be obtained, or the CSV is missing or
    lacks the required columns.
    """
    print("Spotify URL Extractor for Top 500 Albums")
    print("=" * 50)

    # Check if credentials are set
    if CLIENT_ID == "your_client_id_here" or CLIENT_SECRET == "your_client_secret_here":
        print("ERROR: Please set your Spotify API credentials in the script!")
        print("1. Go to https://developer.spotify.com/")
        print("2. Create an app and get your Client ID and Client Secret")
        print("3. Replace CLIENT_ID and CLIENT_SECRET in this script")
        return

    # Get access token
    print("Getting Spotify access token...")
    access_token = get_spotify_access_token()
    if not access_token:
        print("Failed to get access token. Check your credentials.")
        return

    print("Access token obtained successfully!")

    # Read the albums data
    albums = []
    try:
        # newline='' lets the csv module handle newlines inside quoted fields.
        with open('top_500_albums_2023.csv', 'r', encoding='utf-8', newline='') as f:
            reader = csv.DictReader(f)
            albums = list(reader)
            fieldnames = reader.fieldnames or []
    except FileNotFoundError:
        print("Error: top_500_albums_2023.csv not found!")
        return

    # Fail with a clear message instead of a KeyError on the first row.
    missing_columns = sorted({'Artist', 'Album'} - set(fieldnames))
    if missing_columns:
        print(f"Error: top_500_albums_2023.csv is missing column(s): {', '.join(missing_columns)}")
        return

    total = len(albums)
    print(f"Found {total} albums to process")

    # NOTE: keyed by album title only, so two albums that share a title
    # overwrite each other and len(spotify_mappings) can undercount hits.
    spotify_mappings = {}
    failed_albums = []

    for i, album in enumerate(albums, 1):
        artist = album['Artist']
        album_name = album['Album']

        print(f"[{i}/{total}] Searching: {artist} - {album_name}")

        spotify_url = search_spotify_album(artist, album_name, access_token)

        if spotify_url:
            spotify_mappings[album_name] = spotify_url
            print(f"  ✓ Found: {spotify_url}")
        else:
            failed_albums.append((artist, album_name))
            print("  ✗ Not found")

        # Rate limiting to be respectful to Spotify API
        time.sleep(0.2)

        # Save progress every 50 albums so an interrupted run is not lost.
        if i % 50 == 0:
            with open('spotify_urls_mapping_progress.json', 'w', encoding='utf-8') as f:
                json.dump(spotify_mappings, f, indent=2, ensure_ascii=False)
            print(f"Progress saved. Found {len(spotify_mappings)} URLs so far.")

    # Save the final mappings
    with open('spotify_urls_mapping.json', 'w', encoding='utf-8') as f:
        json.dump(spotify_mappings, f, indent=2, ensure_ascii=False)

    # Save failed albums for manual review
    if failed_albums:
        with open('failed_spotify_searches.txt', 'w', encoding='utf-8') as f:
            f.write("Albums not found on Spotify:\n")
            f.write("=" * 40 + "\n")
            for failed_artist, failed_album in failed_albums:
                f.write(f"{failed_artist} - {failed_album}\n")

    # A CSV with a header but no rows would otherwise divide by zero here.
    success_rate = len(spotify_mappings) / total * 100 if total else 0.0

    print("\nCompleted!")
    print(f"Successfully found Spotify URLs for {len(spotify_mappings)} albums")
    print(f"Failed to find {len(failed_albums)} albums")
    print(f"Success rate: {success_rate:.1f}%")
    print("\nResults saved to:")
    print("  - spotify_urls_mapping.json")
    if failed_albums:
        print("  - failed_spotify_searches.txt")

# Run the extractor only when this file is executed as a script, so that
# importing the module (e.g. to reuse search_spotify_album) has no side effects.
if __name__ == "__main__":
    main()