# top500albums/scripts/correct_new_albums.py

#!/usr/bin/env python3
"""
Correct the "New in 2023" markings to show only truly new albums.
Should be 8 new albums to balance the 8 dropped albums.
"""

import csv

def normalize_text(text):
    """Normalize text for comparison.

    Lower-cases the text, rewrites ``&`` as ``and`` and collapses every run
    of whitespace into a single space (leading and trailing whitespace is
    dropped), so differently formatted spellings of the same name compare
    equal.

    Args:
        text: The raw string to normalize. ``None`` is treated as an empty
            string, since ``csv.DictReader`` yields ``None`` for fields
            missing from a short row.

    Returns:
        The normalized string.
    """
    # split()/join collapses whitespace runs of any length; a single
    # replace('  ', ' ') pass would leave "a   b" as "a  b".
    return ' '.join((text or '').lower().replace('&', 'and').split())

def _is_marked_new(album):
    """Return True when the row's Status field contains 'New in 2023'."""
    # DictReader stores None for fields missing from a short row, so fall
    # back to an empty string before the substring test.
    return 'New in 2023' in (album.get('Status') or '')


def _album_key(album):
    """Return the normalized (artist, album) tuple used to match rows."""
    return (normalize_text(album['Artist']), normalize_text(album['Album']))


def main():
    """Correct the 'New in 2023' statuses in top_500_albums_2023.csv.

    Reads the 2020 list from rolling_stone_2020_simplified.csv, finds every
    2023 row whose Status contains 'New in 2023' but whose normalized
    (Artist, Album) pair also appears in the 2020 list, prints a report, and
    rewrites top_500_albums_2023.csv in place with those rows' Status set to
    'No change'. All other rows are written back unchanged.

    Raises:
        FileNotFoundError: If either CSV file is missing from the current
            working directory.
        KeyError: If a row lacks an 'Artist' or 'Album' column (or 'Rank'
            for a row that is listed in the report).
    """
    # Read 2020 albums for comparison. newline='' lets the csv module handle
    # line endings itself, including newlines embedded in quoted fields.
    with open('rolling_stone_2020_simplified.csv', 'r', newline='', encoding='utf-8') as file:
        albums_2020 = {_album_key(row) for row in csv.DictReader(file)}

    print(f"📊 Loaded {len(albums_2020)} albums from 2020 list")

    # Read current 2023 data, remembering which columns the file really has.
    with open('top_500_albums_2023.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        albums_2023 = list(reader)
        input_columns = reader.fieldnames or []  # None for an empty file

    # Classify the rows marked "New in 2023" and build the corrected rows in
    # the same pass, so the report and the written file cannot disagree.
    truly_new = []
    incorrectly_marked = []
    updated_albums = []
    for album in albums_2023:
        updated_album = album.copy()
        if _is_marked_new(album):
            if _album_key(album) in albums_2020:
                # Was already on the 2020 list, so it is not new.
                incorrectly_marked.append(album)
                updated_album['Status'] = 'No change'
            else:
                truly_new.append(album)
        updated_albums.append(updated_album)

    print("\n📊 Analysis of albums marked as 'New in 2023':")
    print(f"   Total marked as new: {len(truly_new) + len(incorrectly_marked)}")
    print(f"   Truly new (not in 2020): {len(truly_new)}")
    print(f"   Incorrectly marked (were in 2020): {len(incorrectly_marked)}")

    print("\n✅ Truly new albums in 2023:")
    for album in truly_new:
        print(f"   #{album['Rank']} - {album['Artist']} - {album['Album']}")

    print("\n❌ Incorrectly marked as new (were in 2020):")
    for album in incorrectly_marked:
        print(f"   #{album['Rank']} - {album['Artist']} - {album['Album']}")

    # Keep the standard columns first, then append any extra columns found in
    # the input. With a fixed list, DictWriter raises ValueError on an
    # unexpected column after the file has already been truncated for writing.
    fieldnames = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']
    for name in input_columns:
        if name not in fieldnames:
            fieldnames.append(name)

    # Write corrected CSV
    with open('top_500_albums_2023.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(updated_albums)

    print("\n✅ Corrected CSV written with proper 'New in 2023' markings")
    print("📁 Updated: top_500_albums_2023.csv")
    print(f"\n📊 Final count: {len(truly_new)} truly new albums in 2023")

# Run the correction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()