# top500albums/scripts/download_dropped_covers.py

#!/usr/bin/env python3
"""
Download cover art for the dropped albums (ranks 501-523).
Uses iTunes API to search for and download album artwork.
"""

import csv
import urllib.request
import urllib.parse
import json
import time
import os
import re

def sanitize_filename(text):
    """Return *text* reduced to a filename-safe token of at most 100 characters."""
    substitutions = (
        # Characters that are reserved on common filesystems.
        (r'[<>:"/\\|?*]', ''),
        # Anything that is not a word character, whitespace, '-', '_' or '.'.
        (r'[^\w\s\-_.]', ''),
        # Each run of whitespace becomes a single underscore.
        (r'\s+', '_'),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Cap the length so the final filename stays manageable.
    return cleaned[:100]

def search_itunes(artist, album):
    """Search the iTunes Search API for an album and return a cover-art URL.

    Up to five album results are requested. The first result whose artist
    name and album title each contain at least one of the search words is
    preferred; if no result qualifies, the first result is used as a
    best-effort fallback.

    Args:
        artist: Artist name to search for.
        album: Album title to search for.

    Returns:
        The result's ``artworkUrl100`` with the ``100x100bb`` size token
        replaced by ``600x600bb``, or ``None`` when the request fails, no
        results come back, or the chosen result has no artwork URL.
    """
    def match_words(text):
        # Prefer words longer than two characters to avoid matching on
        # noise such as "of" or "a", but keep the short words when they are
        # all there is (e.g. "U2", "IV") so such names can still match.
        words = text.lower().split()
        return [word for word in words if len(word) > 2] or words

    def high_res_artwork(result):
        # Swap the thumbnail size token for the larger rendition.
        return result.get('artworkUrl100', '').replace('100x100bb', '600x600bb')

    # The whole lookup is best-effort: any failure (network, malformed JSON,
    # unexpected payload shape) is reported and treated as "no artwork".
    try:
        search_term = f"{artist} {album}".strip()
        encoded_term = urllib.parse.quote(search_term)

        url = f"https://itunes.apple.com/search?term={encoded_term}&media=music&entity=album&limit=5"

        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read().decode())

        results = data['results'] if data['resultCount'] > 0 else []

        # The search words do not change per result, so compute them once.
        artist_words = match_words(artist)
        album_words = match_words(album)

        for result in results:
            result_artist = result.get('artistName', '').lower()
            result_album = result.get('collectionName', '').lower()

            # Fuzzy matching - at least one word of each must be present.
            artist_match = any(word in result_artist for word in artist_words)
            album_match = any(word in result_album for word in album_words)

            if artist_match and album_match:
                artwork_url = high_res_artwork(result)
                if artwork_url:
                    return artwork_url

        # If no good match, fall back to the first result's artwork.
        if results:
            return high_res_artwork(results[0]) or None

    except Exception as e:
        print(f"   Error searching iTunes for {artist} - {album}: {e}")

    return None

def download_cover(url, filepath):
    """Download the image at *url* and store it at *filepath*.

    The response body is read completely and written to a temporary
    ``<filepath>.part`` file, which is then moved into place. A failed or
    interrupted download therefore never leaves a partial or empty file at
    *filepath* that a later "already exists" check could mistake for a
    finished cover.

    Args:
        url: Location of the image to fetch.
        filepath: Destination path for the downloaded image.

    Returns:
        ``True`` if the image was saved, ``False`` if anything went wrong
        (the error is printed, not raised).
    """
    temp_path = f"{filepath}.part"
    try:
        # Fetch everything before touching the filesystem.
        with urllib.request.urlopen(url, timeout=15) as response:
            data = response.read()

        if not data:
            # An empty body is never a usable cover image.
            raise ValueError("empty response body")

        with open(temp_path, 'wb') as f:
            f.write(data)

        # Promote the finished file in a single step.
        os.replace(temp_path, filepath)
        return True
    except Exception as e:
        print(f"   Error downloading {filepath}: {e}")
        # Clean up a half-written temp file; fine if there is none.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        return False

def main():
    """Download cover art for every CSV row whose Rank is 501 or greater.

    Reads ``top_500_albums_2023.csv`` from the current working directory,
    looks up artwork for each selected album, saves images into the
    ``covers`` directory (skipping files that already exist), prints
    progress and a summary, and writes any failures to
    ``failed_dropped_downloads.txt``.
    """
    # Read the CSV to get dropped albums (dropped albums start at rank 501)
    with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as file:
        dropped_albums = [
            row for row in csv.DictReader(file) if int(row['Rank']) >= 501
        ]

    total = len(dropped_albums)
    print(f"🎨 Found {total} dropped albums needing cover art")

    if not dropped_albums:
        # Nothing to process; this also avoids dividing by zero when
        # computing the coverage percentage below.
        print("📊 Coverage: no dropped albums to process")
        return

    print("📥 Starting download process...\n")

    # Create covers directory if it doesn't exist
    os.makedirs('covers', exist_ok=True)

    success_count = 0
    failed_downloads = []

    for i, album in enumerate(dropped_albums, 1):
        rank = album['Rank']
        artist = album['Artist']
        album_name = album['Album']

        # Generate filename
        safe_artist = sanitize_filename(artist)
        safe_album = sanitize_filename(album_name)
        rank_str = rank.zfill(3)
        filename = f"rank_{rank_str}_{safe_artist}_{safe_album}.jpg"
        filepath = os.path.join('covers', filename)

        print(f"[{i:2d}/{total}] #{rank} - {artist} - {album_name}")

        # Existing files count as successes and need no network traffic,
        # so the rate-limit pause below is skipped for them.
        if os.path.exists(filepath):
            print(f"   ✓ Already exists: {filename}")
            success_count += 1
            continue

        # Search for artwork
        artwork_url = search_itunes(artist, album_name)

        if artwork_url:
            print("   🔍 Found artwork, downloading...")

            if download_cover(artwork_url, filepath):
                print(f"   ✅ Downloaded: {filename}")
                success_count += 1
            else:
                print(f"   ❌ Download failed: {filename}")
                failed_downloads.append((rank, artist, album_name))
        else:
            print(f"   ❌ No artwork found: {filename}")
            failed_downloads.append((rank, artist, album_name))

        # Rate limiting - be nice to iTunes API
        time.sleep(1.2)

    print("\n🎉 Download complete!")
    print(f"✅ Successfully downloaded: {success_count}/{total} covers")

    if failed_downloads:
        print(f"❌ Failed downloads: {len(failed_downloads)}")
        print("\nFailed albums:")
        for rank, artist, album in failed_downloads:
            print(f"   #{rank} - {artist} - {album}")

        # Save failed downloads to file
        with open('failed_dropped_downloads.txt', 'w', encoding='utf-8') as f:
            f.write("Failed to download cover art for these dropped albums:\n\n")
            for rank, artist, album in failed_downloads:
                f.write(f"#{rank} - {artist} - {album}\n")
        print("\n📝 Failed downloads saved to: failed_dropped_downloads.txt")

    coverage_percentage = (success_count / total) * 100
    print(f"\n📊 Coverage: {coverage_percentage:.1f}% of dropped albums have cover art")

# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()