Add comprehensive Top 500 Albums analysis with Wikipedia data integration

- Create wikipedia_500_albums.csv from Wikipedia:WikiProject_Albums/500
- Generate top_500_albums_2023.csv comparing 2020 vs 2023 rankings
- Add ranking change analysis (192 new, 164 improved, 113 dropped)
- Integrate Info and Description from 2020 Rolling Stone data
- Fill missing album information for 70+ additional albums
- Include Python scripts for data processing and analysis
- Update CLAUDE.md with project documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-30 21:17:25 +02:00 · 2025-06-30 21:17:25 +02:00 · 97ea973de0
commit 97ea973de0
parent 49ce813e59
8 changed files with 2607 additions and 0 deletions
--- a/compare_top500_albums.py
+++ b/compare_top500_albums.py
@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+Compare Rolling Stone Top 500 Albums lists from 2020 and 2023 (Wikipedia).
+Identifies new albums, removed albums, and ranking changes.
+"""
+
+import csv
+import re
+from typing import Dict, List, Tuple, Optional
+
+
+def normalize_text(text: str) -> str:
+    """Return *text* in a canonical form suitable for loose comparison.
+
+    The result is lowercased, every character that is neither a word
+    character nor whitespace is turned into a space, runs of whitespace are
+    collapsed to a single space, and both ends are trimmed.
+    """
+    lowered = text.lower()
+    # Punctuation becomes a space (not nothing) so "a-b" and "a b" agree.
+    depunctuated = re.sub(r'[^\w\s]', ' ', lowered)
+    collapsed = re.sub(r'\s+', ' ', depunctuated)
+    return collapsed.strip()
+
+
+def create_album_key(artist: str, album: str) -> str:
+    """Build the ``artist|album`` key used to match albums across lists.
+
+    Both parts are passed through normalize_text(). The artist part then
+    loses a leading "the " and has every " and " / " & " replaced by a single
+    space, so spellings such as "The Mamas & the Papas" and
+    "Mamas and the Papas" end up with the same key.
+    """
+    artist_part = normalize_text(artist)
+    album_part = normalize_text(album)
+
+    # Drop a leading article so "The Beatles" and "Beatles" agree.
+    if artist_part.startswith("the "):
+        artist_part = artist_part[4:]
+
+    # Treat "and" and "&" as interchangeable joiners by removing both.
+    for joiner in (" and ", " & "):
+        artist_part = artist_part.replace(joiner, " ")
+
+    return artist_part + "|" + album_part
+
+
+def fuzzy_match(key1: str, key2: str) -> bool:
+    """Decide whether two album keys plausibly refer to the same album.
+
+    Keys have the form ``artist|album`` (as produced by create_album_key).
+    Two keys match when they are identical, or when their artist parts are
+    identical and the album titles share at least 80% of the words of the
+    shorter title.
+
+    Args:
+        key1: First ``artist|album`` key.
+        key2: Second ``artist|album`` key.
+
+    Returns:
+        True if the keys are considered the same album, False otherwise.
+    """
+    # Exact match after normalization
+    if key1 == key2:
+        return True
+
+    # partition() never raises: a key without "|" simply has an empty album.
+    artist1, _, album1 = key1.partition('|')
+    artist2, _, album2 = key2.partition('|')
+
+    if artist1 != artist2:
+        return False
+
+    words1 = set(album1.split())
+    words2 = set(album2.split())
+
+    # An empty title (e.g. one that was only punctuation before
+    # normalization) would satisfy "0 shared words >= 0" and match every
+    # album by the same artist, so it may only match via the exact
+    # comparison above.
+    if not words1 or not words2:
+        return False
+
+    # NOTE(review): overlap is measured against the shorter title, so a title
+    # whose words are a subset of another ("led zeppelin" vs
+    # "led zeppelin ii") also matches; tighten if such pairs show up.
+    shared = len(words1 & words2)
+    return shared >= min(len(words1), len(words2)) * 0.8
+
+
+def find_album_in_2020(artist: str, album: str, albums_2020: Dict[str, Tuple[int, str, str]]) -> Optional[int]:
+    """Look up an album's 2020 rank, or return None when it is not listed.
+
+    ``albums_2020`` maps a create_album_key() key to a
+    ``(rank, artist, album)`` tuple. An identical key wins outright;
+    otherwise the rank of the first entry (in dict iteration order) that
+    fuzzy_match() accepts is returned.
+    """
+    wanted = create_album_key(artist, album)
+
+    # Fast path: identical normalized key.
+    if wanted in albums_2020:
+        return albums_2020[wanted][0]
+
+    # Slow path: scan every 2020 entry and stop at the first fuzzy hit.
+    fuzzy_ranks = (
+        rank
+        for candidate, (rank, _, _) in albums_2020.items()
+        if fuzzy_match(wanted, candidate)
+    )
+    return next(fuzzy_ranks, None)
+
+
+def main():
+    """Compare the 2023 list against the 2020 list and report the changes.
+
+    Reads ``rolling_stone_top_500_albums_2020.csv`` and
+    ``wikipedia_500_albums.csv`` from the current directory (each row needs
+    ``Rank``, ``Artist`` and ``Album`` columns), writes
+    ``top_500_albums_2023.csv`` with an added ``Status`` column, and prints
+    summary statistics plus the five biggest moves in each direction.
+
+    ``Status`` is "New in 2023", "No change", or the signed rank change
+    (``+N`` moved up, ``-N`` moved down).
+
+    Raises:
+        OSError: If an input file cannot be opened or the output file
+            cannot be written.
+        KeyError: If a required column is missing from an input row.
+        ValueError: If a ``Rank`` value is not an integer.
+    """
+    # Read 2020 Rolling Stone list
+    albums_2020 = {}  # key -> (rank, artist, album)
+
+    print("Reading 2020 Rolling Stone list...")
+    # newline='' leaves line-ending handling to the csv module, as its
+    # documentation requires, so quoted fields containing newlines parse
+    # correctly.
+    with open('rolling_stone_top_500_albums_2020.csv', 'r', newline='', encoding='utf-8') as f:
+        for row in csv.DictReader(f):
+            rank = int(row['Rank'])
+            artist = row['Artist']
+            album = row['Album']
+            albums_2020[create_album_key(artist, album)] = (rank, artist, album)
+
+    print(f"Loaded {len(albums_2020)} albums from 2020 list")
+
+    # Read 2023 Wikipedia list and compare
+    results = []
+    # Movers are recorded with their numeric change while comparing, so the
+    # summary below does not have to parse the status strings back to ints.
+    biggest_improvements = []  # (change, artist, album, 2023 rank), change > 0
+    biggest_drops = []  # (change, artist, album, 2023 rank), change < 0
+
+    print("\nReading 2023 Wikipedia list and comparing...")
+    with open('wikipedia_500_albums.csv', 'r', newline='', encoding='utf-8') as f:
+        for row in csv.DictReader(f):
+            rank_2023 = int(row['Rank'])
+            artist = row['Artist']
+            album = row['Album']
+
+            rank_2020 = find_album_in_2020(artist, album, albums_2020)
+
+            if rank_2020 is None:
+                status = "New in 2023"
+            else:
+                # Positive change = smaller rank number now = moved up.
+                change = rank_2020 - rank_2023
+                if change == 0:
+                    status = "No change"
+                elif change > 0:
+                    status = f"+{change}"
+                    biggest_improvements.append((change, artist, album, rank_2023))
+                else:
+                    status = str(change)
+                    biggest_drops.append((change, artist, album, rank_2023))
+
+            results.append({
+                'Rank': rank_2023,
+                'Artist': artist,
+                'Album': album,
+                'Status': status,
+            })
+
+    # Write results to CSV
+    print("\nWriting results to top_500_albums_2023.csv...")
+    with open('top_500_albums_2023.csv', 'w', newline='', encoding='utf-8') as f:
+        fieldnames = ['Rank', 'Artist', 'Album', 'Status']
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(results)
+
+    # Generate summary statistics
+    new_albums = sum(1 for r in results if r['Status'] == "New in 2023")
+    no_change = sum(1 for r in results if r['Status'] == "No change")
+    improved = len(biggest_improvements)
+    dropped = len(biggest_drops)
+
+    print("\nSummary:")
+    print(f"Total albums in 2023 list: {len(results)}")
+    print(f"New albums in 2023: {new_albums}")
+    print(f"Albums with no ranking change: {no_change}")
+    print(f"Albums that improved ranking: {improved}")
+    print(f"Albums that dropped in ranking: {dropped}")
+
+    # Largest climb first; most negative change (largest fall) first.
+    biggest_improvements.sort(reverse=True)
+    biggest_drops.sort()
+
+    print("\nTop 5 biggest improvements:")
+    for change, artist, album, rank in biggest_improvements[:5]:
+        print(f"  {artist} - {album}: {change} (now at #{rank})")
+
+    print("\nTop 5 biggest drops:")
+    for change, artist, album, rank in biggest_drops[:5]:
+        print(f"  {artist} - {album}: {change} (now at #{rank})")
+
+    print("\nResults saved to: top_500_albums_2023.csv")
+
+
+# Run the comparison only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()