# top500albums/scripts/compare_2020_vs_wikipedia.py

#!/usr/bin/env python3
"""
Compare 2020 Rolling Stone list against full 2023 data to find truly dropped albums.
"""

import csv

def normalize_text(text):
    """Normalize text for comparison.

    Lowercases the input, rewrites ``&`` as ``and`` and collapses every run
    of whitespace into a single space, dropping leading and trailing
    whitespace.

    Args:
        text: The raw string to normalize (e.g. an artist or album name).

    Returns:
        The normalized string, suitable for use as a comparison key.
    """
    replaced = text.lower().replace('&', 'and')
    # split()/join collapses whitespace runs of any length; a single
    # replace('  ', ' ') pass would leave "a   b" as "a  b". It also
    # strips both ends.
    return ' '.join(replaced.split())

def main():
    """Report albums on the 2020 list that are missing from the 2023 list.

    Reads ``rolling_stone_2020_simple.csv`` and
    ``wikipedia_top_500_albums.csv`` (paths relative to the working
    directory), prints every album whose normalized (artist, album) pair
    occurs only in the 2020 file, ordered by 2020 rank, and writes those
    albums to ``final_dropped_albums.csv``.
    """
    # 2020 list: normalized (artist, album) -> the row's original values.
    with open('rolling_stone_2020_simple.csv', 'r', encoding='utf-8') as src_2020:
        albums_2020 = {
            (normalize_text(row['Artist']), normalize_text(row['Album'])): {
                'rank': row['Rank'],
                'artist': row['Artist'],
                'album': row['Album'],
            }
            for row in csv.DictReader(src_2020)
        }

    print(f"📊 Loaded {len(albums_2020)} albums from 2020 list")

    # 2023 list: only membership matters, so keep just the normalized keys.
    with open('wikipedia_top_500_albums.csv', 'r', encoding='utf-8') as src_2023:
        albums_2023_all = {
            (normalize_text(row['artist']), normalize_text(row['album']))
            for row in csv.DictReader(src_2023)
        }

    print(f"📊 Loaded {len(albums_2023_all)} albums from Wikipedia 2023 list")

    # Albums present in 2020 but absent from 2023, ordered by 2020 rank.
    dropped_albums = sorted(
        (
            info
            for pair, info in albums_2020.items()
            if pair not in albums_2023_all
        ),
        key=lambda info: int(info['rank']),
    )

    rule = "=" * 80
    print(f"\n❌ Found {len(dropped_albums)} albums dropped from 2020 to 2023:")
    print(rule)

    for entry in dropped_albums:
        print(f"#{entry['rank']:3s} - {entry['artist']} - {entry['album']}")

    print(rule)
    print(f"\n📊 Summary:")
    print(f"   - Albums in 2020: {len(albums_2020)}")
    print(f"   - Albums in complete 2023: {len(albums_2023_all)}")
    print(f"   - Albums dropped: {len(dropped_albums)}")

    # Persist the dropped albums in the same rank order as printed above.
    with open('final_dropped_albums.csv', 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(
            out, fieldnames=['Original_Rank_2020', 'Artist', 'Album']
        )
        writer.writeheader()
        writer.writerows(
            {
                'Original_Rank_2020': entry['rank'],
                'Artist': entry['artist'],
                'Album': entry['album'],
            }
            for entry in dropped_albums
        )

    print(f"\n💾 Saved final dropped list to: final_dropped_albums.csv")

# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()