top500albums/scripts/find_truly_new_fixed.py
Johan Lundberg c3a24799c8 Complete dropped albums list with all 89 truly dropped albums from 2020
- Added all 89 albums that were genuinely dropped from 2020 to 2023
- Fixed incorrect status markings (many albums marked "New in 2023" were not new)
- Removed duplicates and albums incorrectly marked as dropped
- Final count: 589 total (500 main list + 89 dropped)
- Updated JavaScript validation for extended range
- Created comprehensive analysis scripts to verify data

Math now adds up correctly: 89 albums dropped to make room for new additions

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-01 01:14:06 +02:00

101 lines
No EOL
3.9 KiB
Python

#!/usr/bin/env python3
"""
Find albums that are TRULY new in 2023 with better name matching.
"""
import csv
import re
def normalize_text(text):
    """Normalize an artist/album name for fuzzy comparison.

    Lowercases the text, strips all punctuation, removes the stop words
    "the" and "and" as whole words only, and collapses whitespace runs
    to a single space.

    Args:
        text: Raw artist or album name.

    Returns:
        Normalized string suitable for use in a lookup key.
    """
    text = text.lower().strip()
    # Remove all punctuation (keep word characters and whitespace).
    text = re.sub(r'[^\w\s]', '', text)
    # Drop stop words as whole words only. The previous str.replace()
    # approach removed the substrings anywhere, mangling words that
    # merely contain them (e.g. "band" -> "b", "weather" -> "wear").
    text = re.sub(r'\b(?:the|and)\b', '', text)
    # Collapse any run of whitespace to one space and trim the ends
    # (the old `replace(' ', ' ')` was a no-op).
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def main():
    """Cross-check 2023 'New in 2023' statuses against the 2020 list and report discrepancies."""
    # Index the 2020 list by normalized (artist, album) pairs; also keep
    # the original-spelling ranks around for reference.
    seen_2020 = {}
    ranks_2020_by_original = {}
    with open('rolling_stone_2020_simple.csv', 'r', encoding='utf-8') as fh:
        for record in csv.DictReader(fh):
            lookup = (normalize_text(record['Artist']), normalize_text(record['Album']))
            seen_2020[lookup] = record
            ranks_2020_by_original[(record['Artist'], record['Album'])] = record['Rank']
    print(f"📊 Loaded {len(seen_2020)} albums from 2020 list")

    # Spot-check a handful of well-known albums against the 2020 index.
    print("\n🔍 Checking specific albums:")
    spot_checks = [
        ("The Rolling Stones", "Exile on Main St."),
        ("The Beatles", "Sgt. Pepper's Lonely Hearts Club Band"),
        ("Beyonce", "Renaissance"),
        ("Taylor Swift", "Folklore"),
        ("Bad Bunny", "Un Verano Sin Ti"),
    ]
    for artist, album in spot_checks:
        hit = (normalize_text(artist), normalize_text(album)) in seen_2020
        print(f" {artist} - {album}: {'Found in 2020' if hit else 'NOT in 2020'}")

    # Walk the 2023 list and split the entries marked "New in 2023" into
    # genuinely new albums vs. ones that were actually on the 2020 list.
    truly_new = []
    incorrectly_marked_new = []
    with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as fh:
        for record in csv.DictReader(fh):
            if int(record['Rank']) > 500:
                continue
            if record['Status'] != 'New in 2023':
                continue
            lookup = (normalize_text(record['Artist']), normalize_text(record['Album']))
            match = seen_2020.get(lookup)
            if match is None:
                truly_new.append({
                    'rank': record['Rank'],
                    'artist': record['Artist'],
                    'album': record['Album']
                })
            else:
                incorrectly_marked_new.append({
                    'rank': record['Rank'],
                    'artist': record['Artist'],
                    'album': record['Album'],
                    'rank_2020': match['Rank']
                })

    divider = "=" * 80

    print(f"\n✅ TRULY new albums in 2023 (not in 2020 list):")
    print(divider)
    for entry in truly_new:
        print(f"#{entry['rank']:3s} - {entry['artist']} - {entry['album']}")
    print(divider)
    print(f"Total truly new: {len(truly_new)}")

    print(f"\n❌ Incorrectly marked as 'New in 2023' (were in 2020 list):")
    print(divider)
    for entry in incorrectly_marked_new[:20]:  # cap the listing at 20 rows
        print(f"#{entry['rank']:3s} - {entry['artist']} - {entry['album']} (was #{entry['rank_2020']} in 2020)")
    overflow = len(incorrectly_marked_new) - 20
    if overflow > 0:
        print(f"... and {overflow} more")
    print(divider)
    print(f"Total incorrectly marked: {len(incorrectly_marked_new)}")

    # Summary arithmetic for the list-size bookkeeping.
    print(f"\n📊 Final Summary:")
    print(f" - Albums marked 'New in 2023': {len(truly_new) + len(incorrectly_marked_new)}")
    print(f" - Actually new (not in 2020): {len(truly_new)}")
    print(f" - Incorrectly marked as new: {len(incorrectly_marked_new)}")
    print(f" - Total dropped from 2020: Should be {len(truly_new)} to maintain 500 total")
if __name__ == "__main__":
main()