# top500albums/scripts/remap_covers.py

#!/usr/bin/env python3
"""
Script to remap existing cover art files to match the new CSV ranking structure.
This avoids having to re-download all the covers.
"""

import csv
import os
import re
import shutil
from pathlib import Path

def sanitize_filename(text):
    """Turn arbitrary text into a string that is safe to use in a filename.

    Characters reserved by common filesystems are dropped first, then anything
    that is not a word character, whitespace, hyphen, underscore or dot.
    Surrounding whitespace is trimmed, each remaining whitespace run becomes a
    single underscore, and the result is cut to at most 100 characters.
    """
    cleaned = text
    # Two stripping passes: filesystem-reserved characters, then everything
    # outside the allowed set.
    for pattern in (r'[<>:"/\\|?*]', r'[^\w\s\-_\.]'):
        cleaned = re.sub(pattern, '', cleaned)

    underscored = re.sub(r'\s+', '_', cleaned.strip())

    # Keep filenames comfortably below typical path-length limits.
    return underscored[:100]

def normalize_for_matching(text):
    """Reduce an artist or album name to a canonical form for comparison.

    The name is lower-cased and trimmed, punctuation other than ``&`` is
    removed, whitespace runs collapse to one space, the connectors
    `` and `` / `` & `` are replaced by a single space, and a leading
    ``the `` is dropped.
    """
    lowered = text.lower().strip()
    without_punct = re.sub(r'[^\w\s&]', '', lowered)
    collapsed = re.sub(r'\s+', ' ', without_punct)

    # "X and Y" and "X & Y" should compare equal, so drop both connectors.
    for connector in (' and ', ' & '):
        collapsed = collapsed.replace(connector, ' ')

    # A leading article is frequently omitted ("The Beatles" vs "Beatles").
    return collapsed[4:] if collapsed.startswith('the ') else collapsed

def find_matching_cover(artist, album, existing_covers):
    """Return the existing cover filename that best matches an artist/album.

    Filenames are expected to look like ``rank_XXX_Artist_Album.jpg``. The
    underscore both separates artist from album and stands in for spaces
    inside each of them, so the boundary is ambiguous; every possible split
    point is tried for every file.

    A split is a candidate when the normalized artist and the normalized
    album each contain, or are contained in, their counterpart taken from the
    filename. Candidates are ranked by the number of words shared with the
    target (artist words + album words); the number of fields that match
    exactly is the tie-breaker. The best candidate across *all* files wins,
    so a loose match early in the list cannot shadow a better one later
    (e.g. "Led Zeppelin" vs "Led Zeppelin II").

    Args:
        artist: Artist name as it appears in the CSV.
        album: Album title as it appears in the CSV.
        existing_covers: Iterable of cover filenames (names only, no directory).

    Returns:
        The best-matching filename, or ``None`` if no candidate shares at
        least two words with the target.
    """
    target_artist = normalize_for_matching(artist)
    target_album = normalize_for_matching(album)
    # Built once; they are invariant across every file and split point.
    target_artist_words = set(target_artist.split())
    target_album_words = set(target_album.split())

    best_key = (0, 0)  # (shared word count, exact field matches)
    best_file = None

    for cover_file in existing_covers:
        # Strip only a trailing extension, not every ".jpg" substring.
        stem = cover_file[:-4] if cover_file.endswith('.jpg') else cover_file
        parts = stem.split('_')
        # Need at least "rank", the number, one artist token, one album token.
        if len(parts) < 4:
            continue

        file_parts = parts[2:]  # Skip "rank" and "XXX"

        for split_point in range(1, len(file_parts)):
            file_artist = normalize_for_matching(' '.join(file_parts[:split_point]))
            file_album = normalize_for_matching(' '.join(file_parts[split_point:]))

            if not (target_artist in file_artist or file_artist in target_artist):
                continue
            if not (target_album in file_album or file_album in target_album):
                continue

            shared_words = (
                len(target_artist_words & set(file_artist.split()))
                + len(target_album_words & set(file_album.split()))
            )
            exact_fields = (file_artist == target_artist) + (file_album == target_album)

            candidate_key = (shared_words, exact_fields)
            # Strict comparison keeps the first file seen on a complete tie.
            if candidate_key > best_key:
                best_key = candidate_key
                best_file = cover_file

    # Require at least 2 shared words before trusting the match.
    return best_file if best_key[0] >= 2 else None

def main():
    """Rebuild the ``covers`` directory so filenames follow the CSV ranking.

    All paths are relative to the current working directory.

    1. Check that ``covers/`` and ``top_500_albums_2023.csv`` exist; abort
       without touching anything if either is missing.
    2. Copy ``covers/`` to ``covers_backup/``, replacing any previous backup.
    3. For every CSV row that has both an ``Artist`` and an ``Album``, look up
       an existing cover with ``find_matching_cover`` and copy it into
       ``covers_new/`` as ``rank_<Rank>_<Artist>_<Album>.jpg``.
    4. Print a summary, then replace ``covers/`` with ``covers_new/``.

    If no cover at all could be mapped, step 4's replacement is skipped so the
    existing ``covers/`` directory is not swapped for an empty one.
    """
    covers_dir = Path('covers')
    backup_dir = Path('covers_backup')
    new_covers_dir = Path('covers_new')
    csv_file = 'top_500_albums_2023.csv'

    if not covers_dir.exists():
        print("No covers directory found!")
        return

    # Validate every input before modifying the filesystem, so that a missing
    # CSV cannot cost us the previous backup.
    if not os.path.exists(csv_file):
        print(f"Error: {csv_file} not found!")
        return

    # Create backup directory
    if backup_dir.exists():
        print("Backup directory already exists. Removing it...")
        shutil.rmtree(backup_dir)

    print("Creating backup of existing covers...")
    shutil.copytree(covers_dir, backup_dir)
    print(f"Backup created at {backup_dir}")

    # os.listdir order is arbitrary; sort so matching is reproducible.
    existing_covers = sorted(
        name for name in os.listdir(covers_dir) if name.endswith('.jpg')
    )
    print(f"Found {len(existing_covers)} existing cover files")

    # Start from an empty staging directory.
    if new_covers_dir.exists():
        shutil.rmtree(new_covers_dir)
    new_covers_dir.mkdir()

    mapped_count = 0
    unmapped_albums = []

    print("\nMapping covers to new rankings...")

    # newline='' is what the csv module expects for files it parses itself.
    with open(csv_file, 'r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            # DictReader fills missing trailing fields with None, so a plain
            # row.get(key, '') can still return None for a short row.
            rank = (row.get('Rank') or '').strip()
            artist = (row.get('Artist') or '').strip()
            album = (row.get('Album') or '').strip()

            if not artist or not album:
                continue

            matching_cover = find_matching_cover(artist, album, existing_covers)

            if not matching_cover:
                unmapped_albums.append(f"{rank}. {artist} - {album}")
                print(f"✗ No cover found for: {rank}. {artist} - {album}")
                continue

            # Create new filename with correct ranking
            safe_artist = sanitize_filename(artist)
            safe_album = sanitize_filename(album)
            new_filename = f"rank_{rank.zfill(3)}_{safe_artist}_{safe_album}.jpg"

            shutil.copy2(covers_dir / matching_cover, new_covers_dir / new_filename)
            mapped_count += 1

            if mapped_count % 50 == 0:
                print(f"✓ Mapped {mapped_count} covers so far...")

    unmapped_count = len(unmapped_albums)
    total = mapped_count + unmapped_count

    print("\n🎉 MAPPING RESULTS:")
    print(f"Successfully mapped: {mapped_count}")
    print(f"Could not map: {unmapped_count}")
    print(f"Total albums: {total}")
    if total:
        print(f"Success rate: {mapped_count / total * 100:.1f}%")
    else:
        # Avoid dividing by zero when the CSV had no usable rows.
        print("Success rate: n/a (no albums processed)")

    if unmapped_albums:
        print(f"\n❌ Albums without covers ({unmapped_count}):")
        for entry in unmapped_albums[:10]:
            print(f"  {entry}")
        if unmapped_count > 10:
            print(f"  ... and {unmapped_count - 10} more")

    if mapped_count == 0:
        # Swapping in an empty directory would only discard the current covers.
        print("\nNo covers were mapped; leaving the covers directory unchanged.")
        shutil.rmtree(new_covers_dir)
        return

    # Replace old covers directory with new one
    print("\nReplacing covers directory...")
    shutil.rmtree(covers_dir)
    shutil.move(new_covers_dir, covers_dir)

    print("✅ Cover remapping complete!")
    print(f"Original covers backed up to: {backup_dir}")
    print(f"New covers available in: {covers_dir}")

if __name__ == "__main__":
    main()