top500albums/compare_top500_albums.py
Johan Lundberg 462fdcfa84 Complete Top 500 Albums project with 100% data coverage and UI improvements
- Fixed Info/Description columns after regenerating CSV with clean Wikipedia data
- Remapped and downloaded missing album covers to match new rankings
- Modified website UI to show all description text without click-to-expand
- Added comprehensive Info/Description for all 500 albums using research
- Created multiple data processing scripts for album information completion
- Achieved 100% data completion with descriptions ending "(by Claude)" for new content
- All albums now have complete metadata and cover art

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-01 00:33:47 +02:00

178 lines
No EOL
6.1 KiB
Python

#!/usr/bin/env python3
"""
Compare Rolling Stone Top 500 Albums lists from 2020 and 2023 (Wikipedia).
Identifies new albums, removed albums, and ranking changes.
"""
import csv
import re
from typing import Dict, List, Tuple, Optional
def normalize_text(text: str) -> str:
    """Return *text* in a canonical form for comparison.

    The text is lowercased, every character that is neither a word
    character nor whitespace is turned into a space, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is dropped.
    """
    # Punctuation is replaced by a space rather than deleted, so "AC/DC"
    # becomes "ac dc" and not "acdc".
    without_punctuation = re.sub(r'[^\w\s]', ' ', text.lower())
    return re.sub(r'\s+', ' ', without_punctuation).strip()
def create_album_key(artist: str, album: str) -> str:
    """Build the ``"artist|album"`` key used to match albums across lists.

    Both parts go through ``normalize_text``. The artist part additionally
    loses one leading "the " and has each " and " / " & " replaced by a
    single space, so "Simon & Garfunkel" and "Simon and Garfunkel" produce
    the same key.
    """
    artist_part = normalize_text(artist)
    album_part = normalize_text(album)

    # Drop a leading article so "The Beatles" and "Beatles" agree.
    leading_article = "the "
    if artist_part.startswith(leading_article):
        artist_part = artist_part[len(leading_article):]

    # Treat the two spellings of a conjunction between names as equivalent.
    for joiner in (" and ", " & "):
        artist_part = artist_part.replace(joiner, " ")

    return "|".join((artist_part, album_part))
def fuzzy_match(key1: str, key2: str) -> bool:
    """Return True when two album keys plausibly refer to the same album.

    Keys are ``"artist|album"`` strings. Identical keys always match.
    Otherwise the artist parts must be equal, both album parts must contain
    at least one word, and at least 80% of the words of the shorter album
    title must also occur in the other title.

    Raises:
        ValueError: if the keys differ and either one lacks the ``|``
            separator.
    """
    # Exact match after normalization
    if key1 == key2:
        return True

    # Split into artist and album parts
    artist1, album1 = key1.split('|', 1)
    artist2, album2 = key2.split('|', 1)
    if artist1 != artist2:
        return False

    words1 = set(album1.split())
    words2 = set(album2.split())

    # A title with no words gives a threshold of 0, and 0 >= 0 would make it
    # "match" every other album by the same artist. Only an exact key match
    # (handled above) may pair up such titles.
    if not words1 or not words2:
        return False

    # The overlap is measured against the shorter title, so a title whose
    # words are a subset of the other's always matches.
    # NOTE(review): this also pairs e.g. "led zeppelin" with
    # "led zeppelin ii" -- confirm that is acceptable for the data set.
    shared = len(words1 & words2)
    return shared >= min(len(words1), len(words2)) * 0.8
def find_album_in_2020(artist: str, album: str, albums_2020: Dict[str, Tuple[int, str, str]]) -> Optional[int]:
    """Look up an album's rank in the 2020 list.

    Args:
        artist: Artist name as it appears in the 2023 list.
        album: Album title as it appears in the 2023 list.
        albums_2020: Mapping of normalized album key to
            ``(rank, artist, album)`` for the 2020 list.

    Returns:
        The 2020 rank of the exact key match if there is one, otherwise the
        rank of the first entry (in mapping order) accepted by
        ``fuzzy_match``, otherwise ``None``.
    """
    wanted = create_album_key(artist, album)

    # Cheap path: the normalized keys are identical.
    if wanted in albums_2020:
        return albums_2020[wanted][0]

    # Fall back to scanning every 2020 entry for a fuzzy match.
    return next(
        (
            rank
            for candidate, (rank, _, _) in albums_2020.items()
            if fuzzy_match(wanted, candidate)
        ),
        None,
    )
def main():
    """Compare the 2023 list against the 2020 list and report the changes.

    Reads ``rolling_stone_top_500_albums_2020.csv`` (columns ``Rank``,
    ``Artist``, ``Album``) and ``wikipedia_top_500_albums.csv`` (columns
    ``rank``, ``artist``, ``album``) from the current working directory.
    Writes ``top_500_albums_2023.csv`` with the columns ``Rank``,
    ``Artist``, ``Album`` and ``Status``, then prints summary statistics and
    the five biggest movers in each direction.

    ``Status`` is "New in 2023", "No change", "+N" (moved up N places) or
    "-N" (moved down N places).
    """
    output_path = 'top_500_albums_2023.csv'

    # Read 2020 Rolling Stone list
    albums_2020 = {}  # key -> (rank, artist, album)
    print("Reading 2020 Rolling Stone list...")
    with open('rolling_stone_top_500_albums_2020.csv', 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            rank = int(row['Rank'])
            artist = row['Artist']
            album = row['Album']
            # Rows that normalize to the same key overwrite earlier ones.
            albums_2020[create_album_key(artist, album)] = (rank, artist, album)
    print(f"Loaded {len(albums_2020)} albums from 2020 list")

    # Read 2023 Wikipedia list and compare. Statistics are collected here
    # from the numeric rank change instead of being parsed back out of the
    # formatted status strings afterwards.
    results = []
    new_albums = 0
    no_change = 0
    biggest_improvements = []  # (change, artist, album, rank_2023), change > 0
    biggest_drops = []  # (change, artist, album, rank_2023), change < 0
    print("\nReading 2023 Wikipedia list and comparing...")
    with open('wikipedia_top_500_albums.csv', 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            rank_2023 = int(row['rank'])
            artist = row['artist'].strip()
            album = row['album'].strip()

            # Find in 2020 list
            rank_2020 = find_album_in_2020(artist, album, albums_2020)
            if rank_2020 is None:
                status = "New in 2023"
                new_albums += 1
            else:
                # A smaller rank number is better, so positive means moved up.
                change = rank_2020 - rank_2023
                if change == 0:
                    status = "No change"
                    no_change += 1
                elif change > 0:
                    status = f"+{change}"  # Improved ranking (moved up)
                    biggest_improvements.append((change, artist, album, rank_2023))
                else:
                    status = str(change)  # Dropped ranking (moved down)
                    biggest_drops.append((change, artist, album, rank_2023))

            results.append({
                'Rank': rank_2023,
                'Artist': artist,
                'Album': album,
                'Status': status,
            })

    # Write results to CSV
    print(f"\nWriting results to {output_path}...")
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Rank', 'Artist', 'Album', 'Status'])
        writer.writeheader()
        writer.writerows(results)

    # Summary statistics
    print("\nSummary:")
    print(f"Total albums in 2023 list: {len(results)}")
    print(f"New albums in 2023: {new_albums}")
    print(f"Albums with no ranking change: {no_change}")
    print(f"Albums that improved ranking: {len(biggest_improvements)}")
    print(f"Albums that dropped in ranking: {len(biggest_drops)}")

    # Biggest movers: largest positive change first, most negative first.
    biggest_improvements.sort(reverse=True)
    biggest_drops.sort()

    print("\nTop 5 biggest improvements:")
    for change, artist, album, rank in biggest_improvements[:5]:
        print(f" {artist} - {album}: {change} (now at #{rank})")

    print("\nTop 5 biggest drops:")
    for change, artist, album, rank in biggest_drops[:5]:
        print(f" {artist} - {album}: {change} (now at #{rank})")

    print(f"\nResults saved to: {output_path}")
# Run the comparison only when this file is executed as a script, so that
# importing the module does not read or write any files.
if __name__ == "__main__":
    main()