#!/usr/bin/env python3
"""
Find albums that are TRULY new in 2023 with better name matching.
"""

import csv
import re
import unicodedata

# Compiled once at import time: normalize_text runs for every CSV row.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_FILLER_WORD_RE = re.compile(r'\b(?:and|the)\b')
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_text(text):
    """Normalize an artist or album name for fuzzy comparison.

    The text is lower-cased, accents are folded to their base letters,
    punctuation is removed, the standalone words "and" / "the" are dropped
    and runs of whitespace are collapsed to a single space.

    Args:
        text: The raw artist or album name.

    Returns:
        The normalized string (possibly empty).

    >>> normalize_text("The Beatles")
    'beatles'
    >>> normalize_text("Sgt. Pepper's Lonely Hearts Club Band")
    'sgt peppers lonely hearts club band'
    """
    text = text.lower().strip()
    # Fold accents so that "Beyoncé" and "Beyonce" compare equal.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Remove all punctuation ("&" disappears here, matching a dropped "and").
    text = _PUNCTUATION_RE.sub('', text)
    # Drop filler words only as whole words; a plain substring replace would
    # turn "band" into "b" and "other" into "or".
    text = _FILLER_WORD_RE.sub(' ', text)
    # Collapse whitespace last so removed words/punctuation leave no gaps.
    return _WHITESPACE_RE.sub(' ', text).strip()


def _album_key(row):
    """Return the normalized (artist, album) lookup key for a CSV row."""
    return (normalize_text(row['Artist']), normalize_text(row['Album']))


def _load_2020_albums(path):
    """Read the 2020 CSV and map each normalized (artist, album) to its row."""
    albums = {}
    with open(path, 'r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            albums[_album_key(row)] = row
    return albums


def _split_new_in_2023(path, albums_2020):
    """Split 2023 rows marked 'New in 2023' into truly new and mis-marked."""
    truly_new = []
    incorrectly_marked_new = []
    with open(path, 'r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            if int(row['Rank']) > 500 or row['Status'] != 'New in 2023':
                continue
            entry = {
                'rank': row['Rank'],
                'artist': row['Artist'],
                'album': row['Album'],
            }
            match_2020 = albums_2020.get(_album_key(row))
            if match_2020 is None:
                truly_new.append(entry)
            else:
                entry['rank_2020'] = match_2020['Rank']
                incorrectly_marked_new.append(entry)
    return truly_new, incorrectly_marked_new


def main(path_2020='rolling_stone_2020_simple.csv',
         path_2023='top_500_albums_2023.csv'):
    """Report which 'New in 2023' albums are genuinely absent from 2020.

    Args:
        path_2020: CSV of the 2020 list; read via the columns
            ``Artist``, ``Album`` and ``Rank``.
        path_2023: CSV of the 2023 list; read via the columns
            ``Rank``, ``Artist``, ``Album`` and ``Status``.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If a required column is missing.
        ValueError: If a 2023 ``Rank`` value is not an integer.
    """
    albums_2020_normalized = _load_2020_albums(path_2020)
    print(f"📊 Loaded {len(albums_2020_normalized)} albums from 2020 list")

    # Check specific cases
    print("\n🔍 Checking specific albums:")
    test_cases = [
        ("The Rolling Stones", "Exile on Main St."),
        ("The Beatles", "Sgt. Pepper's Lonely Hearts Club Band"),
        ("Beyonce", "Renaissance"),
        ("Taylor Swift", "Folklore"),
        ("Bad Bunny", "Un Verano Sin Ti"),
    ]
    for artist, album in test_cases:
        norm_key = (normalize_text(artist), normalize_text(album))
        found = norm_key in albums_2020_normalized
        print(f" {artist} - {album}: {'Found in 2020' if found else 'NOT in 2020'}")

    # Read 2023 albums and find truly new ones
    truly_new, incorrectly_marked_new = _split_new_in_2023(
        path_2023, albums_2020_normalized
    )

    print("\n✅ TRULY new albums in 2023 (not in 2020 list):")
    print("=" * 80)
    for album in truly_new:
        print(f"#{album['rank']:3s} - {album['artist']} - {album['album']}")
    print("=" * 80)
    print(f"Total truly new: {len(truly_new)}")

    print("\n❌ Incorrectly marked as 'New in 2023' (were in 2020 list):")
    print("=" * 80)
    for album in incorrectly_marked_new[:20]:  # Show first 20
        print(f"#{album['rank']:3s} - {album['artist']} - {album['album']} (was #{album['rank_2020']} in 2020)")
    if len(incorrectly_marked_new) > 20:
        print(f"... and {len(incorrectly_marked_new) - 20} more")
    print("=" * 80)
    print(f"Total incorrectly marked: {len(incorrectly_marked_new)}")

    # Calculate correct numbers
    print("\n📊 Final Summary:")
    print(f" - Albums marked 'New in 2023': {len(truly_new) + len(incorrectly_marked_new)}")
    print(f" - Actually new (not in 2020): {len(truly_new)}")
    print(f" - Incorrectly marked as new: {len(incorrectly_marked_new)}")
    print(f" - Total dropped from 2020: Should be {len(truly_new)} to maintain 500 total")


if __name__ == "__main__":
    main()