- Create wikipedia_500_albums.csv from Wikipedia:WikiProject_Albums/500 - Generate top_500_albums_2023.csv comparing 2020 vs 2023 rankings - Add ranking change analysis (192 new, 164 improved, 113 dropped) - Integrate Info and Description from 2020 Rolling Stone data - Fill missing album information for 70+ additional albums - Include Python scripts for data processing and analysis - Update CLAUDE.md with project documentation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
178 lines
No EOL
6.1 KiB
Python
178 lines
No EOL
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Compare Rolling Stone Top 500 Albums lists from 2020 and 2023 (Wikipedia).
|
|
Identifies new albums, removed albums, and ranking changes.
|
|
"""
|
|
|
|
import csv
|
|
import re
|
|
from typing import Dict, List, Tuple, Optional
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Return a canonical form of ``text`` suitable for equality comparison.

    The value is lowercased, every character that is neither a word
    character nor whitespace is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is stripped at both ends.

    Note that ``\\w`` includes the underscore, so underscores survive
    normalization.

    Args:
        text: Raw artist or album string.

    Returns:
        The normalized string (possibly empty).
    """
    # Punctuation becomes a space (rather than vanishing) so that "AC/DC"
    # yields two tokens instead of gluing them together.
    depunctuated = re.sub(r'[^\w\s]', ' ', text.lower())
    return re.sub(r'\s+', ' ', depunctuated).strip()
|
|
|
|
|
|
def create_album_key(artist: str, album: str) -> str:
    """Build the ``"artist|album"`` lookup key used to match albums across lists.

    Both parts are passed through ``normalize_text``. The artist part is
    additionally stripped of a leading "the " and of the connectives
    " and " / " & ", so that spelling variants of the same act produce the
    same key.

    Args:
        artist: Artist name as it appears in a source list.
        album: Album title as it appears in a source list.

    Returns:
        The normalized artist and album joined by a single ``|``.
    """
    artist_part = normalize_text(artist)
    album_part = normalize_text(album)

    # Drop a leading article: "the beatles" -> "beatles".
    article = "the "
    if artist_part.startswith(article):
        artist_part = artist_part[len(article):]

    # Treat "X and Y" and "X & Y" as the same act.
    for connective in (" and ", " & "):
        artist_part = artist_part.replace(connective, " ")

    return artist_part + "|" + album_part
|
|
|
|
|
|
def fuzzy_match(key1: str, key2: str) -> bool:
    """Check if two album keys are similar enough to be considered the same.

    Keys are expected in the ``"artist|album"`` form. Two keys match when
    they are identical, or when the artist parts are identical and the
    album parts share at least 80% of the words of the shorter title.

    Known limitation: because the threshold is relative to the *shorter*
    title, a title whose words are a subset of another's (for example
    "led zeppelin" vs "led zeppelin ii") is still reported as a match.

    Args:
        key1: First album key.
        key2: Second album key.

    Returns:
        True if the keys are considered the same album, False otherwise.

    Raises:
        ValueError: If the keys differ and either one has no ``|``
            separator.
    """
    # Exact match after normalization.
    if key1 == key2:
        return True

    artist1, album1 = key1.split('|', 1)
    artist2, album2 = key2.split('|', 1)

    # Different artists never match, however similar the titles are.
    if artist1 != artist2:
        return False

    words1 = set(album1.split())
    words2 = set(album2.split())

    # A title with no words has an overlap of 0 and a threshold of 0, so
    # without this guard it would "match" every album by the same artist.
    if not words1 or not words2:
        return False

    # Match when most words of the shorter title appear in the other one.
    shared = len(words1 & words2)
    return shared >= min(len(words1), len(words2)) * 0.8
|
|
|
|
|
|
def find_album_in_2020(artist: str, album: str, albums_2020: Dict[str, Tuple[int, str, str]]) -> Optional[int]:
    """Look up an album in the 2020 list and return its 2020 rank.

    An exact key lookup is attempted first. If that fails, the 2020 entries
    are scanned in dictionary order and the rank of the first entry that
    ``fuzzy_match`` accepts is returned.

    Args:
        artist: Artist name from the 2023 list.
        album: Album title from the 2023 list.
        albums_2020: Mapping of album key -> ``(rank, artist, album)`` for
            the 2020 list.

    Returns:
        The 2020 rank, or None when no exact or fuzzy match exists.
    """
    wanted = create_album_key(artist, album)

    # Fast path: identical normalized key.
    if wanted in albums_2020:
        exact_rank = albums_2020[wanted][0]
        return exact_rank

    # Slow path: first fuzzy hit wins; None when nothing is close enough.
    fuzzy_ranks = (
        candidate_rank
        for candidate_key, (candidate_rank, _artist, _album) in albums_2020.items()
        if fuzzy_match(wanted, candidate_key)
    )
    return next(fuzzy_ranks, None)
|
|
|
|
|
|
def main():
    """Compare the 2020 and 2023 lists and write the per-album ranking change.

    Reads ``rolling_stone_top_500_albums_2020.csv`` and
    ``wikipedia_500_albums.csv`` (both relative to the working directory,
    each with ``Rank``, ``Artist`` and ``Album`` columns), writes
    ``top_500_albums_2023.csv`` with an added ``Status`` column, and prints
    summary statistics plus the five biggest movers in each direction.

    ``Status`` is "New in 2023", "No change", "+N" (moved up N places) or
    "-N" (moved down N places). File and parsing errors are not handled
    here and propagate to the caller.
    """
    # ---- Load the 2020 list: key -> (rank, artist, album) ----
    print("Reading 2020 Rolling Stone list...")
    albums_2020 = {}
    with open('rolling_stone_top_500_albums_2020.csv', 'r', encoding='utf-8') as source_2020:
        for record in csv.DictReader(source_2020):
            old_rank = int(record['Rank'])
            performer = record['Artist']
            title = record['Album']
            albums_2020[create_album_key(performer, title)] = (old_rank, performer, title)

    print(f"Loaded {len(albums_2020)} albums from 2020 list")

    # ---- Walk the 2023 list and classify every entry ----
    print("\nReading 2023 Wikipedia list and comparing...")
    results = []
    with open('wikipedia_500_albums.csv', 'r', encoding='utf-8') as source_2023:
        for record in csv.DictReader(source_2023):
            rank_2023 = int(record['Rank'])
            performer = record['Artist']
            title = record['Album']

            rank_2020 = find_album_in_2020(performer, title, albums_2020)

            if rank_2020 is None:
                status = "New in 2023"
            else:
                # Positive delta means a smaller rank number now, i.e. moved up.
                delta = rank_2020 - rank_2023
                if delta > 0:
                    status = f"+{delta}"
                elif delta < 0:
                    status = str(delta)
                else:
                    status = "No change"

            results.append({
                'Rank': rank_2023,
                'Artist': performer,
                'Album': title,
                'Status': status,
            })

    # ---- Persist the comparison ----
    print("\nWriting results to top_500_albums_2023.csv...")
    with open('top_500_albums_2023.csv', 'w', newline='', encoding='utf-8') as destination:
        writer = csv.DictWriter(destination, fieldnames=['Rank', 'Artist', 'Album', 'Status'])
        writer.writeheader()
        writer.writerows(results)

    # ---- Summary statistics ----
    statuses = [entry['Status'] for entry in results]
    new_albums = statuses.count("New in 2023")
    no_change = statuses.count("No change")
    improved = len([s for s in statuses if s.startswith('+')])
    # A status starting with '-' can only be a numeric drop.
    dropped = len([s for s in statuses if s.startswith('-')])

    print("\nSummary:")
    print(f"Total albums in 2023 list: {len(results)}")
    print(f"New albums in 2023: {new_albums}")
    print(f"Albums with no ranking change: {no_change}")
    print(f"Albums that improved ranking: {improved}")
    print(f"Albums that dropped in ranking: {dropped}")

    # ---- Biggest movers: (change, artist, album, current rank) ----
    biggest_improvements = sorted(
        (
            (int(entry['Status']), entry['Artist'], entry['Album'], entry['Rank'])
            for entry in results
            if entry['Status'].startswith('+')
        ),
        reverse=True,
    )
    biggest_drops = sorted(
        (int(entry['Status']), entry['Artist'], entry['Album'], entry['Rank'])
        for entry in results
        if entry['Status'].startswith('-')
    )

    print("\nTop 5 biggest improvements:")
    for change, artist, album, rank in biggest_improvements[:5]:
        print(f" {artist} - {album}: {change} (now at #{rank})")

    print("\nTop 5 biggest drops:")
    for change, artist, album, rank in biggest_drops[:5]:
        print(f" {artist} - {album}: {change} (now at #{rank})")

    print("\nResults saved to: top_500_albums_2023.csv")
|
|
|
|
|
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()