# top500albums/compare_top500_albums.py

#!/usr/bin/env python3
"""
Compare Rolling Stone Top 500 Albums lists from 2020 and 2023 (Wikipedia).
Identifies new albums, removed albums, and ranking changes.
"""

import csv
import re
from typing import Dict, List, Tuple, Optional


def normalize_text(text: str) -> str:
    """Return a canonical form of *text* for loose string comparison.

    The text is lowercased, every character that is neither a word
    character nor whitespace is turned into a space, and runs of
    whitespace are collapsed to a single space with nothing left at
    either end.
    """
    lowered = text.lower()
    # Punctuation becomes a space (not nothing) so "AC/DC" -> "ac dc".
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()


def create_album_key(artist: str, album: str) -> str:
    """Build the ``artist|album`` key used to match albums across lists.

    Both parts are passed through ``normalize_text``. The artist part
    additionally loses a leading "the " and any " and " / " & "
    connective, so "The Beatles" and "Beatles" yield the same key.
    """
    artist_part = normalize_text(artist)
    album_part = normalize_text(album)

    # Drop a leading article so "The X" and "X" collapse to one key.
    article = "the "
    if artist_part.startswith(article):
        artist_part = artist_part[len(article):]

    # Treat "A and B" and "A & B" as the same act.
    for connective in (" and ", " & "):
        artist_part = artist_part.replace(connective, " ")

    return "|".join((artist_part, album_part))


def fuzzy_match(key1: str, key2: str) -> bool:
    """Return True if two album keys likely refer to the same album.

    Keys are ``artist|album`` strings. Identical keys always match.
    Otherwise the artist parts must be equal and at least 80% of the
    distinct words of the shorter album title must also occur in the
    other title.

    An album part that contains no words never fuzzy-matches; only an
    identical key can match it.

    Raises:
        ValueError: if the keys differ and either one contains no ``|``.
    """
    if key1 == key2:
        return True

    artist1, album1 = key1.split('|', 1)
    artist2, album2 = key2.split('|', 1)
    if artist1 != artist2:
        return False

    words1 = set(album1.split())
    words2 = set(album2.split())

    # Without this guard an empty title makes the threshold below 0, and
    # "0 shared words >= 0" would match every album by the same artist.
    if not words1 or not words2:
        return False

    # NOTE: the threshold is relative to the *shorter* title, so a title
    # whose words are a subset of the other's still counts as a match.
    shared = len(words1 & words2)
    return shared >= min(len(words1), len(words2)) * 0.8


def find_album_in_2020(artist: str, album: str, albums_2020: Dict[str, Tuple[int, str, str]]) -> Optional[int]:
    """Return the 2020 rank of the given album, or None if it is absent.

    ``albums_2020`` maps album keys to ``(rank, artist, album)`` tuples.
    An exact key hit wins; otherwise the rank of the first entry (in
    dictionary order) that ``fuzzy_match`` accepts is returned.
    """
    wanted = create_album_key(artist, album)

    # Fast path: the normalized key is present as-is.
    if wanted in albums_2020:
        return albums_2020[wanted][0]

    # Slow path: scan every 2020 entry for a fuzzy match.
    return next(
        (
            rank
            for candidate, (rank, _, _) in albums_2020.items()
            if fuzzy_match(wanted, candidate)
        ),
        None,
    )


def main(
    path_2020: str = 'rolling_stone_top_500_albums_2020.csv',
    path_2023: str = 'wikipedia_top_500_albums.csv',
    output_path: str = 'top_500_albums_2023.csv',
) -> None:
    """Compare the 2020 and 2023 lists and write a per-album status CSV.

    Args:
        path_2020: CSV with ``Rank``, ``Artist`` and ``Album`` columns.
        path_2023: CSV with ``rank``, ``artist`` and ``album`` columns.
        output_path: destination CSV with ``Rank``, ``Artist``, ``Album``
            and ``Status`` columns, one row per 2023 entry.

    ``Status`` is "New in 2023", "No change", or the signed rank change
    (2020 rank minus 2023 rank, so a positive value means the album moved
    up). A summary and the five biggest moves in each direction are
    printed to stdout.

    Raises:
        OSError: if an input file cannot be opened or the output written.
        KeyError: if an expected column is missing from an input file.
        ValueError: if a rank value is not an integer.
    """
    # Read 2020 Rolling Stone list
    albums_2020: Dict[str, Tuple[int, str, str]] = {}  # key -> (rank, artist, album)

    print("Reading 2020 Rolling Stone list...")
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to a reader.
    with open(path_2020, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            rank = int(row['Rank'])
            artist = row['Artist']
            album = row['Album']
            # A later row with the same normalized key replaces an earlier one.
            albums_2020[create_album_key(artist, album)] = (rank, artist, album)

    print(f"Loaded {len(albums_2020)} albums from 2020 list")

    # Read 2023 Wikipedia list and compare
    results = []
    # (change, artist, album, rank_2023) for every album whose rank changed.
    # Kept numeric so the summary never has to parse status strings back.
    movers: List[Tuple[int, str, str, int]] = []

    print("\nReading 2023 Wikipedia list and comparing...")
    with open(path_2023, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            rank_2023 = int(row['rank'])
            artist = row['artist'].strip()
            album = row['album'].strip()

            rank_2020 = find_album_in_2020(artist, album, albums_2020)

            if rank_2020 is None:
                status = "New in 2023"
            else:
                change = rank_2020 - rank_2023
                if change == 0:
                    status = "No change"
                else:
                    # Explicit sign: "+N" moved up, "-N" moved down.
                    status = f"{change:+d}"
                    movers.append((change, artist, album, rank_2023))

            results.append({
                'Rank': rank_2023,
                'Artist': artist,
                'Album': album,
                'Status': status,
            })

    # Write results to CSV
    print(f"\nWriting results to {output_path}...")
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Rank', 'Artist', 'Album', 'Status'])
        writer.writeheader()
        writer.writerows(results)

    # Generate summary statistics
    new_albums = sum(1 for r in results if r['Status'] == "New in 2023")
    no_change = sum(1 for r in results if r['Status'] == "No change")
    biggest_improvements = sorted((m for m in movers if m[0] > 0), reverse=True)
    biggest_drops = sorted(m for m in movers if m[0] < 0)

    print("\nSummary:")
    print(f"Total albums in 2023 list: {len(results)}")
    print(f"New albums in 2023: {new_albums}")
    print(f"Albums with no ranking change: {no_change}")
    print(f"Albums that improved ranking: {len(biggest_improvements)}")
    print(f"Albums that dropped in ranking: {len(biggest_drops)}")

    print("\nTop 5 biggest improvements:")
    for change, artist, album, rank in biggest_improvements[:5]:
        print(f"  {artist} - {album}: {change} (now at #{rank})")

    print("\nTop 5 biggest drops:")
    for change, artist, album, rank in biggest_drops[:5]:
        print(f"  {artist} - {album}: {change} (now at #{rank})")

    print(f"\nResults saved to: {output_path}")


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()