top500albums/scripts/find_truly_new_fixed.py
Johan Lundberg c3a24799c8 Complete dropped albums list with all 89 truly dropped albums from 2020
- Added all 89 albums that were genuinely dropped from 2020 to 2023
- Fixed incorrect status markings (many albums marked "New in 2023" were not new)
- Removed duplicates and albums incorrectly marked as dropped
- Final count: 589 total (500 main list + 89 dropped)
- Updated JavaScript validation for extended range
- Created comprehensive analysis scripts to verify data

Math now adds up correctly: 89 albums dropped to make room for new additions

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-01 01:14:06 +02:00

101 lines
No EOL
3.9 KiB
Python

#!/usr/bin/env python3
"""
Find albums that are TRULY new in 2023 with better name matching.
"""
import csv
import re
def normalize_text(text):
    """Normalize an artist/album name for fuzzy comparison.

    Lowercases the text, strips all punctuation, removes the stop words
    "the" and "and" as whole words only, and collapses whitespace runs
    to a single space.

    Args:
        text: Raw artist or album name.

    Returns:
        Normalized string suitable for use in a lookup key.
    """
    text = text.lower().strip()
    # Remove all punctuation (keep word characters and whitespace).
    text = re.sub(r'[^\w\s]', '', text)
    # Drop stop words as whole words only. The previous str.replace()
    # approach removed the substrings anywhere, mangling words that
    # merely contain them (e.g. "band" -> "b", "weather" -> "wear").
    text = re.sub(r'\b(?:the|and)\b', '', text)
    # Collapse any run of whitespace to one space and trim the ends
    # (the old `replace(' ', ' ')` was a no-op).
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def main():
    """Cross-check 2023 'New in 2023' statuses against the 2020 list and report discrepancies."""
    # Index the 2020 list by normalized (artist, album) pairs; also keep
    # the original-spelling ranks around for reference.
    seen_2020 = {}
    ranks_2020_by_original = {}
    with open('rolling_stone_2020_simple.csv', 'r', encoding='utf-8') as fh:
        for record in csv.DictReader(fh):
            lookup = (normalize_text(record['Artist']), normalize_text(record['Album']))
            seen_2020[lookup] = record
            ranks_2020_by_original[(record['Artist'], record['Album'])] = record['Rank']
    print(f"📊 Loaded {len(seen_2020)} albums from 2020 list")

    # Spot-check a handful of well-known albums against the 2020 index.
    print("\n🔍 Checking specific albums:")
    spot_checks = [
        ("The Rolling Stones", "Exile on Main St."),
        ("The Beatles", "Sgt. Pepper's Lonely Hearts Club Band"),
        ("Beyonce", "Renaissance"),
        ("Taylor Swift", "Folklore"),
        ("Bad Bunny", "Un Verano Sin Ti"),
    ]
    for artist, album in spot_checks:
        hit = (normalize_text(artist), normalize_text(album)) in seen_2020
        print(f" {artist} - {album}: {'Found in 2020' if hit else 'NOT in 2020'}")

    # Walk the 2023 list and split the entries marked "New in 2023" into
    # genuinely new albums vs. ones that were actually on the 2020 list.
    truly_new = []
    incorrectly_marked_new = []
    with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as fh:
        for record in csv.DictReader(fh):
            if int(record['Rank']) > 500:
                continue
            if record['Status'] != 'New in 2023':
                continue
            lookup = (normalize_text(record['Artist']), normalize_text(record['Album']))
            match = seen_2020.get(lookup)
            if match is None:
                truly_new.append({
                    'rank': record['Rank'],
                    'artist': record['Artist'],
                    'album': record['Album']
                })
            else:
                incorrectly_marked_new.append({
                    'rank': record['Rank'],
                    'artist': record['Artist'],
                    'album': record['Album'],
                    'rank_2020': match['Rank']
                })

    divider = "=" * 80

    print(f"\n✅ TRULY new albums in 2023 (not in 2020 list):")
    print(divider)
    for entry in truly_new:
        print(f"#{entry['rank']:3s} - {entry['artist']} - {entry['album']}")
    print(divider)
    print(f"Total truly new: {len(truly_new)}")

    print(f"\n❌ Incorrectly marked as 'New in 2023' (were in 2020 list):")
    print(divider)
    for entry in incorrectly_marked_new[:20]:  # cap the listing at 20 rows
        print(f"#{entry['rank']:3s} - {entry['artist']} - {entry['album']} (was #{entry['rank_2020']} in 2020)")
    overflow = len(incorrectly_marked_new) - 20
    if overflow > 0:
        print(f"... and {overflow} more")
    print(divider)
    print(f"Total incorrectly marked: {len(incorrectly_marked_new)}")

    # Summary arithmetic for the list-size bookkeeping.
    print(f"\n📊 Final Summary:")
    print(f" - Albums marked 'New in 2023': {len(truly_new) + len(incorrectly_marked_new)}")
    print(f" - Actually new (not in 2020): {len(truly_new)}")
    print(f" - Incorrectly marked as new: {len(incorrectly_marked_new)}")
    print(f" - Total dropped from 2020: Should be {len(truly_new)} to maintain 500 total")
if __name__ == "__main__":
main()