top500albums/scripts/download_album_covers_simple.py

#!/usr/bin/env python3
"""
Download album cover images for Top 500 Albums using iTunes Search API
Simple version using only built-in Python modules
"""

import urllib.request
import urllib.parse
import json
import csv
import os
import re
import time

def sanitize_filename(text):
    """Reduce arbitrary text to a token that is safe to embed in a filename.

    Reserved path characters are dropped first, then every remaining
    character that is not a word character, whitespace, hyphen, underscore
    or dot. The text is trimmed, each run of whitespace collapses to a single
    underscore, and the result is capped at 100 characters.
    """
    # (pattern, replacement) pairs applied in order before trimming.
    removals = (
        (r'[<>:"/\\|?*]', ''),
        (r'[^\w\s\-_\.]', ''),
    )

    cleaned = text
    for pattern, replacement in removals:
        cleaned = re.sub(pattern, replacement, cleaned)

    trimmed = cleaned.strip()
    joined = re.sub(r'\s+', '_', trimmed)

    max_length = 100
    return joined[:max_length]

def _names_overlap(wanted, candidate):
    """Return True when both names are non-empty and one contains the other."""
    # The emptiness check matters: '' is a substring of every string, so a
    # result with a missing name would otherwise match any query.
    return bool(wanted and candidate) and (wanted in candidate or candidate in wanted)


def _large_artwork_url(result):
    """Return the 600x600 variant of a result's ``artworkUrl100`` value."""
    return result['artworkUrl100'].replace('100x100', '600x600')


def _pick_artwork_url(results, artist, album):
    """Choose the artwork URL of the best result, or None if none has artwork."""
    # Results without artwork can never satisfy the caller, so drop them first.
    candidates = [result for result in results if result.get('artworkUrl100')]
    if not candidates:
        return None

    wanted_artist = artist.lower()
    wanted_album = album.lower()

    for result in candidates:
        result_artist = (result.get('artistName') or '').lower()
        result_album = (result.get('collectionName') or '').lower()

        # Simple matching: each name must contain, or be contained in, the other.
        if _names_overlap(wanted_artist, result_artist) and \
           _names_overlap(wanted_album, result_album):
            return _large_artwork_url(result)

    # No name match: fall back to the first result that has artwork.
    return _large_artwork_url(candidates[0])


def search_itunes(artist, album):
    """Search the iTunes API for album artwork.

    Args:
        artist: Artist name as listed in the album data.
        album: Album title as listed in the album data.

    Returns:
        The artwork URL (with ``100x100`` rewritten to ``600x600``) of the
        best matching result, or None when the search term is empty, the
        request fails, or no result carries artwork. Failures are printed
        rather than raised so a batch run keeps going.
    """
    # Clean up search terms
    search_term = f"{artist} {album}".strip()
    search_term = re.sub(r'\([^)]*\)', '', search_term)  # Remove parentheses content
    search_term = re.sub(r'\s+', ' ', search_term).strip()

    if not search_term:
        # Nothing left to search for; skip the pointless network round trip.
        return None

    # URL encode the search term
    encoded_term = urllib.parse.quote(search_term)
    url = f"https://itunes.apple.com/search?term={encoded_term}&media=music&entity=album&limit=5"

    # Deliberately broad: any network, decoding or response-shape problem
    # is reported and treated as "no artwork" instead of aborting the run.
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read().decode())

        return _pick_artwork_url(data.get('results') or [], artist, album)

    except Exception as e:
        print(f"Error searching for {artist} - {album}: {e}")
        return None

_COVERS_DIR = 'covers'
_CSV_FILE = 'top_500_albums_2023.csv'
_UNRANKED = 999  # Sort position for rows whose rank is missing or not a number
_DOWNLOAD_TIMEOUT = 30  # Seconds to wait on the image server before giving up


def _load_rows(csv_file):
    """Return every row of csv_file as a dict, or None (after reporting) if it is missing."""
    try:
        # utf-8-sig reads plain UTF-8 too, and strips a BOM that would
        # otherwise end up glued to the first column name.
        with open(csv_file, 'r', encoding='utf-8-sig', newline='') as file:
            return list(csv.DictReader(file))
    except FileNotFoundError:
        print(f"Error: {csv_file} not found!")
        return None


def _rank_key(row):
    """Return the row's rank as an int, or _UNRANKED when it cannot be parsed."""
    try:
        return int(row.get('Rank') or _UNRANKED)
    except (TypeError, ValueError):
        return _UNRANKED


def _iter_albums(rows):
    """Yield (rank, artist, album) for each row that has both an artist and an album."""
    for row in rows:
        # DictReader fills fields missing from a short row with None.
        rank = (row.get('Rank') or '').strip()
        artist = (row.get('Artist') or '').strip()
        album = (row.get('Album') or '').strip()

        if artist and album:
            yield rank, artist, album


def _save_image(url, filepath):
    """Download url to filepath, never leaving a partial file at filepath."""
    partial_path = filepath + '.part'
    try:
        with urllib.request.urlopen(url, timeout=_DOWNLOAD_TIMEOUT) as response, \
                open(partial_path, 'wb') as out:
            out.write(response.read())
        # Only a complete download is moved into place, so the
        # "already downloaded" check never sees a truncated image.
        os.replace(partial_path, filepath)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)


def _fetch_covers(albums, covers_dir):
    """Fetch a cover for each (rank, artist, album); return how many are on disk."""
    albums_found = 0

    for rank, artist, album in albums:
        # Create filename
        safe_artist = sanitize_filename(artist)
        safe_album = sanitize_filename(album)
        filename = f"rank_{rank.zfill(3)}_{safe_artist}_{safe_album}.jpg"
        filepath = os.path.join(covers_dir, filename)

        # Skip if already downloaded
        if os.path.exists(filepath):
            print(f"✓ Already exists: {rank}. {artist} - {album}")
            albums_found += 1
            continue

        print(f"Searching: {rank}. {artist} - {album}")

        # Search for artwork
        artwork_url = search_itunes(artist, album)

        if artwork_url:
            try:
                print(f"  Downloading from: {artwork_url}")
                _save_image(artwork_url, filepath)
                print(f"  ✓ Downloaded: {filename}")
                albums_found += 1

            except Exception as e:
                # Best effort: one failed download must not stop the batch.
                print(f"  ✗ Download failed: {e}")
        else:
            print(f"  ✗ No artwork found")

        # Be nice to the API
        time.sleep(1)

    return albums_found


def _print_summary(title, albums_processed, albums_found):
    """Print the closing statistics for a run."""
    print(f"\n{title}")
    print(f"Albums processed: {albums_processed}")
    print(f"Artwork found: {albums_found}")
    if albums_processed:
        print(f"Success rate: {albums_found/albums_processed*100:.1f}%")
    else:
        # Avoid dividing by zero when the CSV had no usable rows.
        print("Success rate: n/a (no albums processed)")


def download_sample_covers(limit=10):
    """Download a sample of album covers to test the system.

    Takes the first ``limit`` rows of the CSV, in file order, that have both
    an artist and an album, and saves each cover into the covers directory.

    Args:
        limit: Maximum number of albums to process; zero or less processes none.

    Returns:
        None. Progress and a summary are printed.
    """
    # Create covers directory
    os.makedirs(_COVERS_DIR, exist_ok=True)

    # Read the CSV file
    rows = _load_rows(_CSV_FILE)
    if rows is None:
        return

    print(f"Downloading sample of {limit} album covers...")

    # Rows without artist/album do not count towards the limit.
    albums = list(_iter_albums(rows))[:max(limit, 0)]
    albums_found = _fetch_covers(albums, _COVERS_DIR)

    _print_summary("Sample Results:", len(albums), albums_found)


def download_top_albums(limit=50):
    """Download covers for top N albums.

    Sorts the CSV rows by numeric rank (unparseable ranks sort last), takes
    the first ``limit`` of them and saves a cover for each row that has both
    an artist and an album.

    Args:
        limit: Number of top-ranked rows to consider.

    Returns:
        None. Progress and a summary are printed.
    """
    # Create covers directory
    os.makedirs(_COVERS_DIR, exist_ok=True)

    # Read the CSV file
    rows = _load_rows(_CSV_FILE)
    if rows is None:
        return

    print(f"Downloading covers for top {limit} albums...")

    # Sort by rank to get top albums first
    rows.sort(key=_rank_key)

    albums = list(_iter_albums(rows[:limit]))
    albums_found = _fetch_covers(albums, _COVERS_DIR)

    _print_summary(f"Top {limit} Results:", len(albums), albums_found)

if __name__ == "__main__":
    # Interactive entry point: show the menu, run the chosen download.
    print("Top 500 Albums Cover Art Downloader (Simple Version)")
    print("==================================================")

    # Menu key -> (download function, number of albums to request).
    menu_actions = {
        '1': (download_sample_covers, 10),
        '2': (download_top_albums, 50),
        '3': (download_top_albums, 100),
    }

    choice = input("Choose option:\n1. Download sample (10 albums)\n2. Download top 50 albums\n3. Download top 100 albums\nEnter choice (1-3): ")

    selected = menu_actions.get(choice)
    if selected is None:
        print("Invalid choice!")
    else:
        downloader, album_count = selected
        downloader(album_count)

    print("\nDone!")