top500albums/scripts/download_missing_covers.py
Johan Lundberg 872fdfa0ee Organize repository by moving all scripts to scripts/ folder
- Moved all Python processing scripts to scripts/ directory for better organization
- Preserves git history using git mv command
- Clean separation between main project files and utility scripts

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-01 00:36:08 +02:00

162 lines
No EOL
6 KiB
Python

#!/usr/bin/env python3
"""
Download album covers for albums that don't have covers yet
"""
import urllib.request
import urllib.parse
import json
import csv
import os
import re
import time
def sanitize_filename(text):
    """Turn arbitrary text into a fragment that is safe to use in a filename.

    Characters reserved by common filesystems, and then anything that is not
    a word character, whitespace, ``-``, ``_`` or ``.``, are dropped. Runs of
    whitespace in the trimmed remainder become single underscores, and the
    result is capped at 100 characters.
    """
    # Two removal passes: the reserved-character blacklist first, then the
    # whitelist that discards every other unsupported character.
    removal_patterns = (r'[<>:"/\\|?*]', r'[^\w\s\-_\.]')
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    underscored = re.sub(r'\s+', '_', cleaned.strip())

    max_length = 100  # keep generated filenames comfortably short
    return underscored[:max_length]
def search_itunes(artist, album):
    """Search the iTunes Search API for an album's cover artwork.

    Args:
        artist: Artist name as it appears in the album list.
        album: Album title as it appears in the album list.

    Returns:
        URL of a 600x600 artwork image, or None when the search fails or no
        result carries artwork. A result whose artist and album names loosely
        match the requested ones is preferred; otherwise the first result
        that has artwork is used as a fallback.
    """
    # Build the query: drop parenthesised qualifiers and collapse whitespace.
    search_term = f"{artist} {album}".strip()
    search_term = re.sub(r'\([^)]*\)', '', search_term)
    search_term = re.sub(r'\s+', ' ', search_term).strip()

    encoded_term = urllib.parse.quote(search_term)
    url = f"https://itunes.apple.com/search?term={encoded_term}&media=music&entity=album&limit=5"

    try:
        with urllib.request.urlopen(url, timeout=15) as response:
            data = json.loads(response.read().decode())

        wanted_artist = artist.lower()
        wanted_album = album.lower()
        fallback_url = None

        for result in data.get('results') or []:
            # The API returns a 100x100 thumbnail URL; rewriting the size
            # component requests the larger 600x600 rendition.
            artwork_url = (result.get('artworkUrl100') or '').replace('100x100', '600x600')
            if not artwork_url:
                # A result without artwork is useless here; keep looking.
                continue
            if fallback_url is None:
                fallback_url = artwork_url

            result_artist = (result.get('artistName') or '').lower()
            result_album = (result.get('collectionName') or '').lower()
            # An empty string is a substring of everything, so a result with
            # a missing name must not be treated as a match.
            if not result_artist or not result_album:
                continue

            # Simple matching: either name may contain the other.
            artist_matches = wanted_artist in result_artist or result_artist in wanted_artist
            album_matches = wanted_album in result_album or result_album in wanted_album
            if artist_matches and album_matches:
                return artwork_url

        # No close match: fall back to the first result that had artwork.
        return fallback_url
    except Exception as e:
        # Best-effort lookup: any network or parsing problem is reported and
        # treated as "no artwork found" so the caller can carry on.
        print(f"Error searching for {artist} - {album}: {e}")
        return None
def _rank_sort_key(row):
    """Return the row's numeric rank, or 999 when it is missing or not a number."""
    try:
        return int(row.get('Rank') or 999)
    except (TypeError, ValueError):
        return 999


def _save_cover(artwork_url, filepath):
    """Download artwork_url to filepath without leaving a partial file behind."""
    partial_path = filepath + '.part'
    try:
        with urllib.request.urlopen(artwork_url, timeout=30) as response:
            image_bytes = response.read()
        if not image_bytes:
            raise ValueError("empty response")
        with open(partial_path, 'wb') as out:
            out.write(image_bytes)
        # Rename only once the whole image is on disk, so an interrupted
        # download is never mistaken for an existing cover on the next run.
        os.replace(partial_path, filepath)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)


def download_missing_covers():
    """Download covers for albums that don't have covers yet.

    Reads ``top_500_albums_2023.csv`` (columns ``Rank``, ``Artist``,
    ``Album``) from the current directory, and for every album whose cover
    is not already present in ``covers/`` looks up artwork via
    ``search_itunes`` and saves it as
    ``rank_<rank>_<artist>_<album>.jpg``. Progress and a final summary are
    printed; albums that could not be fetched are also written to
    ``failed_downloads.txt``.

    Returns:
        None. If the CSV file is missing, an error is printed and nothing
        else happens (apart from creating ``covers/``).
    """
    covers_dir = 'covers'
    os.makedirs(covers_dir, exist_ok=True)

    csv_file = 'top_500_albums_2023.csv'
    if not os.path.exists(csv_file):
        print(f"Error: {csv_file} not found!")
        return

    albums_processed = 0
    albums_found = 0
    albums_skipped = 0
    failed_albums = []

    print("Downloading covers for albums without covers...")
    print("This will take a while to be respectful to the iTunes API...\n")

    # newline='' lets the csv module handle embedded line breaks itself.
    with open(csv_file, 'r', encoding='utf-8', newline='') as file:
        rows = list(csv.DictReader(file))

    # Process in rank order; rows with an unusable rank sort as 999.
    rows.sort(key=_rank_sort_key)

    for row in rows:
        # DictReader yields None for fields missing from short rows.
        rank = (row.get('Rank') or '').strip()
        artist = (row.get('Artist') or '').strip()
        album = (row.get('Album') or '').strip()

        if not artist or not album:
            continue

        albums_processed += 1

        safe_artist = sanitize_filename(artist)
        safe_album = sanitize_filename(album)
        filename = f"rank_{rank.zfill(3)}_{safe_artist}_{safe_album}.jpg"
        filepath = os.path.join(covers_dir, filename)

        # Skip if already exists
        if os.path.exists(filepath):
            albums_skipped += 1
            continue

        print(f"Searching [{albums_processed - albums_skipped}/500]: {rank}. {artist} - {album}")

        artwork_url = search_itunes(artist, album)
        downloaded = False

        if artwork_url:
            try:
                print(f" Downloading from: {artwork_url}")
                _save_cover(artwork_url, filepath)
            except Exception as e:
                # Best effort: record the failure and move on to the next album.
                print(f" ✗ Download failed: {e}")
                failed_albums.append(f"{rank}. {artist} - {album}")
            else:
                print(f" ✓ Downloaded: {filename}")
                albums_found += 1
                downloaded = True
        else:
            print(" ✗ No artwork found")
            failed_albums.append(f"{rank}. {artist} - {album}")

        # Be nice to the API
        time.sleep(1.2)

        # Progress update every 25 new downloads. Only checked right after a
        # successful download so the banner is not repeated while the
        # counter sits on a multiple of 25 during a run of failures.
        if downloaded and albums_found % 25 == 0:
            print("\n--- Progress Update ---")
            print(f"New downloads: {albums_found}")
            print(f"Already existed: {albums_skipped}")
            print(f"Failed: {len(failed_albums)}")
            print("----------------------\n")

    covers_available = albums_found + albums_skipped
    # Guard against an empty or unusable CSV, where nothing was processed.
    success_rate = covers_available / albums_processed * 100 if albums_processed else 0.0

    print("\n🎉 FINAL RESULTS:")
    print(f"Albums processed: {albums_processed}")
    print(f"Already existed: {albums_skipped}")
    print(f"New downloads: {albums_found}")
    print(f"Failed to find: {len(failed_albums)}")
    print(f"Total covers available: {covers_available}")
    print(f"Overall success rate: {success_rate:.1f}%")

    if failed_albums:
        print(f"\n❌ Failed albums ({len(failed_albums)}):")
        for entry in failed_albums[:10]:  # Show first 10
            print(f" {entry}")
        if len(failed_albums) > 10:
            print(f" ... and {len(failed_albums) - 10} more")

        # Save the full list, one album per line; explicit UTF-8 because
        # artist and album names may contain non-ASCII characters.
        with open('failed_downloads.txt', 'w', encoding='utf-8') as f:
            f.writelines(f"{entry}\n" for entry in failed_albums)
        print("\nFull list saved to failed_downloads.txt")
# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_missing_covers()