Add comprehensive Top 500 Albums analysis with Wikipedia data integration

- Create wikipedia_500_albums.csv from Wikipedia:WikiProject_Albums/500
- Generate top_500_albums_2023.csv comparing 2020 vs 2023 rankings
- Add ranking change analysis (192 new, 164 improved, 113 dropped)
- Integrate Info and Description from 2020 Rolling Stone data
- Fill missing album information for 70+ additional albums
- Include Python scripts for data processing and analysis
- Update CLAUDE.md with project documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Johan Lundberg 2025-06-30 21:17:25 +02:00
parent 49ce813e59
commit 97ea973de0
8 changed files with 2607 additions and 0 deletions

178
compare_top500_albums.py Normal file
View file

@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Compare Rolling Stone Top 500 Albums lists from 2020 and 2023 (Wikipedia).
Identifies new albums, removed albums, and ranking changes.
"""
import csv
import re
import unicodedata
from typing import Dict, List, Tuple, Optional
def normalize_text(text: str) -> str:
    """Normalize text for comparison.

    The result is lowercase, has accented letters folded to their base
    letters, has every punctuation character replaced by a space, and has
    runs of whitespace collapsed to a single space with no leading or
    trailing space.

    Args:
        text: Raw artist or album string.

    Returns:
        The normalized string (possibly empty).
    """
    # Decompose characters (NFKD) and drop the combining marks. Without this,
    # a precomposed "é" survives as a word character while a decomposed
    # "e" + U+0301 loses its accent to the punctuation filter below (or even
    # splits the word in two), so the same name could yield different keys.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower()
    # Anything that is neither a word character nor whitespace becomes a space
    # so that "AC/DC" and "AC DC" compare equal.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Collapse whitespace runs introduced above, then trim the ends.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def create_album_key(artist: str, album: str) -> str:
    """Create a normalized key for album matching.

    Args:
        artist: Artist name as it appears in a source list.
        album: Album title as it appears in a source list.

    Returns:
        A string of the form "<artist>|<album>", where both parts have been
        passed through normalize_text and the artist additionally has a
        leading "the " and any interior " and " removed.
    """
    norm_artist = normalize_text(artist)
    norm_album = normalize_text(album)
    # Drop a leading article so "The Beatles" and "Beatles" share a key.
    if norm_artist.startswith("the "):
        norm_artist = norm_artist[4:]
    # Treat "X and Y" like "X & Y". normalize_text has already replaced "&"
    # with whitespace, so only the spelled-out form still needs removing.
    norm_artist = norm_artist.replace(" and ", " ")
    return f"{norm_artist}|{norm_album}"
def fuzzy_match(key1: str, key2: str) -> bool:
    """Check if two album keys are similar enough to be considered the same.

    Keys are expected in the "<artist>|<album>" form. Two keys match when
    they are identical, or when the artist parts are identical and at least
    80% of the words of the shorter album title also occur in the other
    title.

    Args:
        key1: First album key.
        key2: Second album key.

    Returns:
        True if the keys are considered to refer to the same album.
    """
    if key1 == key2:
        return True

    # partition never raises: a key without "|" simply has an empty album
    # part and is rejected by the empty-title guard below.
    artist1, _, album1 = key1.partition('|')
    artist2, _, album2 = key2.partition('|')
    if artist1 != artist2:
        return False

    words1 = set(album1.split())
    words2 = set(album2.split())
    # An empty title would make the threshold 0 and match every album by the
    # same artist, so it must never fuzzy-match.
    if not words1 or not words2:
        return False

    # NOTE(review): the threshold is relative to the shorter title, so a title
    # whose words are all contained in a longer one (e.g. "led zeppelin" vs
    # "led zeppelin iv") is accepted - confirm this is intended.
    return len(words1 & words2) >= min(len(words1), len(words2)) * 0.8
def find_album_in_2020(artist: str, album: str, albums_2020: Dict[str, Tuple[int, str, str]]) -> Optional[int]:
    """Look up a 2023 album in the 2020 list.

    Args:
        artist: Artist name from the 2023 list.
        album: Album title from the 2023 list.
        albums_2020: Mapping of normalized album key to
            (rank, artist, album) for the 2020 list.

    Returns:
        The album's 2020 rank, or None when neither an exact nor a fuzzy
        key match exists.
    """
    wanted = create_album_key(artist, album)

    # Cheap path: identical normalized key.
    exact = albums_2020.get(wanted)
    if exact is not None:
        return exact[0]

    # Slow path: first 2020 entry (in insertion order) that fuzzy-matches.
    return next(
        (
            rank
            for candidate, (rank, _artist, _album) in albums_2020.items()
            if fuzzy_match(wanted, candidate)
        ),
        None,
    )
def _load_2020_albums(path: str) -> Dict[str, Tuple[int, str, str]]:
    """Read the 2020 CSV into a mapping of album key -> (rank, artist, album)."""
    albums: Dict[str, Tuple[int, str, str]] = {}
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            artist = row['Artist']
            album = row['Album']
            # Rows that normalize to the same key overwrite earlier ones.
            albums[create_album_key(artist, album)] = (int(row['Rank']), artist, album)
    return albums


def _rank_status(rank_2020: Optional[int], rank_2023: int) -> Tuple[str, Optional[int]]:
    """Return (status text, numeric change); change is None for new albums."""
    if rank_2020 is None:
        return "New in 2023", None
    change = rank_2020 - rank_2023
    if change == 0:
        return "No change", 0
    # Positive means the album moved up the list; the sign is always shown.
    return f"{change:+d}", change


def _print_movers(title: str, movers: List[Tuple[int, str, str, int]]) -> None:
    """Print a heading followed by the first five (change, artist, album, rank) rows."""
    print(title)
    for change, artist, album, rank in movers[:5]:
        print(f" {artist} - {album}: {change} (now at #{rank})")


def main(
    path_2020: str = 'rolling_stone_top_500_albums_2020.csv',
    path_2023: str = 'wikipedia_500_albums.csv',
    output_path: str = 'top_500_albums_2023.csv',
) -> None:
    """Compare the 2023 list against the 2020 list and write the result CSV.

    Reads both input CSVs (columns Rank, Artist, Album), writes one row per
    2023 album with a Status column ("New in 2023", "No change", or a signed
    rank change), and prints summary statistics and the five biggest movers
    in each direction.

    Args:
        path_2020: Path of the 2020 Rolling Stone CSV.
        path_2023: Path of the 2023 Wikipedia CSV.
        output_path: Path of the CSV to write.
    """
    print("Reading 2020 Rolling Stone list...")
    albums_2020 = _load_2020_albums(path_2020)
    print(f"Loaded {len(albums_2020)} albums from 2020 list")

    results = []
    # Numeric changes are kept next to the rows so the summary does not have
    # to parse them back out of the Status strings.
    movers: List[Tuple[int, str, str, int]] = []
    print("\nReading 2023 Wikipedia list and comparing...")
    with open(path_2023, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            rank_2023 = int(row['Rank'])
            artist = row['Artist']
            album = row['Album']
            rank_2020 = find_album_in_2020(artist, album, albums_2020)
            status, change = _rank_status(rank_2020, rank_2023)
            results.append({
                'Rank': rank_2023,
                'Artist': artist,
                'Album': album,
                'Status': status,
            })
            if change:  # skips both new albums (None) and unchanged ones (0)
                movers.append((change, artist, album, rank_2023))

    print(f"\nWriting results to {output_path}...")
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Rank', 'Artist', 'Album', 'Status'])
        writer.writeheader()
        writer.writerows(results)

    new_albums = sum(1 for r in results if r['Status'] == "New in 2023")
    no_change = sum(1 for r in results if r['Status'] == "No change")
    # Largest gains first / largest losses first; ties fall back to tuple order.
    biggest_improvements = sorted((m for m in movers if m[0] > 0), reverse=True)
    biggest_drops = sorted(m for m in movers if m[0] < 0)

    print("\nSummary:")
    print(f"Total albums in 2023 list: {len(results)}")
    print(f"New albums in 2023: {new_albums}")
    print(f"Albums with no ranking change: {no_change}")
    print(f"Albums that improved ranking: {len(biggest_improvements)}")
    print(f"Albums that dropped in ranking: {len(biggest_drops)}")

    _print_movers("\nTop 5 biggest improvements:", biggest_improvements)
    _print_movers("\nTop 5 biggest drops:", biggest_drops)
    print(f"\nResults saved to: {output_path}")
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()