#!/usr/bin/env python3
"""
Compare Rolling Stone Top 500 Albums lists from 2020 and 2023 (Wikipedia).

For every album in the 2023 list, look it up in the 2020 list and record
whether it is new or how its rank changed. Albums that only appear in the
2020 list are not reported.
"""

import csv
import re
from typing import Dict, List, Tuple, Optional

# Fraction of the shorter album title's words that must also appear in the
# other title for two albums by the same artist to count as a fuzzy match.
_MIN_WORD_OVERLAP = 0.8

# Status labels written to the output CSV for albums without a signed change.
_STATUS_NEW = "New in 2023"
_STATUS_NO_CHANGE = "No change"


def normalize_text(text: str) -> str:
    """Normalize text for comparison.

    Lowercases the text, replaces every character that is neither a word
    character nor whitespace with a space, collapses whitespace runs to a
    single space and strips both ends.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def create_album_key(artist: str, album: str) -> str:
    """Create a normalized ``"artist|album"`` key for album matching.

    Both parts go through :func:`normalize_text`. A leading "the " is dropped
    from the artist and " and " inside the artist is collapsed to a space, so
    "Simon & Garfunkel" and "Simon and Garfunkel" produce the same key.
    """
    norm_artist = normalize_text(artist)
    norm_album = normalize_text(album)

    if norm_artist.startswith("the "):
        norm_artist = norm_artist[4:]

    # normalize_text() has already turned "&" into whitespace, so only the
    # spelled-out "and" needs removing to make both spellings agree.
    norm_artist = norm_artist.replace(" and ", " ")

    return f"{norm_artist}|{norm_album}"


def _album_words(key: str) -> set:
    """Return the set of words in the album part of an ``artist|album`` key."""
    return set(key.split('|', 1)[1].split())


def fuzzy_match(key1: str, key2: str) -> bool:
    """Check if two album keys are similar enough to be considered the same.

    Keys must have the ``artist|album`` shape produced by
    :func:`create_album_key`. Identical keys always match. Otherwise the
    artist parts must be equal and at least 80% of the words of the shorter
    album title must also occur in the other title.

    Returns:
        True when the keys are considered the same album, else False.

    Raises:
        ValueError: if the keys differ and either one contains no ``|``.
    """
    if key1 == key2:
        return True

    artist1, album1 = key1.split('|', 1)
    artist2, album2 = key2.split('|', 1)
    if artist1 != artist2:
        return False

    words1 = set(album1.split())
    words2 = set(album2.split())
    # A title with no words would make the threshold 0 and match every album
    # by the same artist; treat it as unmatched instead.
    if not words1 or not words2:
        return False

    shared = len(words1 & words2)
    return shared >= min(len(words1), len(words2)) * _MIN_WORD_OVERLAP


def find_album_in_2020(artist: str, album: str,
                       albums_2020: Dict[str, Tuple[int, str, str]]) -> Optional[int]:
    """Find an album in the 2020 list and return its rank if found.

    Args:
        artist: Artist name as it appears in the 2023 list.
        album: Album title as it appears in the 2023 list.
        albums_2020: Mapping of normalized key -> (rank, artist, album).

    Returns:
        The 2020 rank of the matching album, or None when nothing matches.
        An exact key match wins; otherwise, among all entries accepted by
        :func:`fuzzy_match`, the one whose title shares the largest fraction
        of words with the query is chosen (the earliest entry wins ties).
    """
    key_2023 = create_album_key(artist, album)

    exact = albums_2020.get(key_2023)
    if exact is not None:
        return exact[0]

    words_2023 = _album_words(key_2023)
    best_rank: Optional[int] = None
    best_score = -1.0
    for key_2020, (rank, _, _) in albums_2020.items():
        if not fuzzy_match(key_2023, key_2020):
            continue
        words_2020 = _album_words(key_2020)
        # Jaccard similarity; fuzzy_match() guarantees both sets are
        # non-empty here, so the union is never empty.
        score = len(words_2023 & words_2020) / len(words_2023 | words_2020)
        if score > best_score:
            best_score = score
            best_rank = rank

    return best_rank


def main(path_2020: str = 'rolling_stone_top_500_albums_2020.csv',
         path_2023: str = 'wikipedia_top_500_albums.csv',
         output_path: str = 'top_500_albums_2023.csv') -> None:
    """Compare the two lists, write the result CSV and print a summary.

    Args:
        path_2020: CSV with ``Rank``, ``Artist`` and ``Album`` columns.
        path_2023: CSV with ``rank``, ``artist`` and ``album`` columns.
        output_path: Destination CSV with ``Rank``, ``Artist``, ``Album``
            and ``Status`` columns, one row per 2023 album.

    Raises:
        OSError: if an input file cannot be opened or the output written.
        KeyError: if an expected column is missing from an input file.
        ValueError: if a rank value is not an integer.
    """
    albums_2020: Dict[str, Tuple[int, str, str]] = {}  # key -> (rank, artist, album)

    print("Reading 2020 Rolling Stone list...")
    with open(path_2020, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            artist = row['Artist']
            album = row['Album']
            albums_2020[create_album_key(artist, album)] = (int(row['Rank']), artist, album)

    print(f"Loaded {len(albums_2020)} albums from 2020 list")

    results = []
    # (change, artist, album, rank_2023) for every album present in both
    # lists; kept numerically so the summary need not re-parse Status text.
    movers: List[Tuple[int, str, str, int]] = []
    new_albums = 0

    print("\nReading 2023 Wikipedia list and comparing...")
    with open(path_2023, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            rank_2023 = int(row['rank'])
            artist = row['artist'].strip()
            album = row['album'].strip()

            rank_2020 = find_album_in_2020(artist, album, albums_2020)

            if rank_2020 is None:
                status = _STATUS_NEW
                new_albums += 1
            else:
                # Positive change means a smaller rank number, i.e. moved up.
                change = rank_2020 - rank_2023
                if change == 0:
                    status = _STATUS_NO_CHANGE
                elif change > 0:
                    status = f"+{change}"
                else:
                    status = str(change)
                movers.append((change, artist, album, rank_2023))

            results.append({
                'Rank': rank_2023,
                'Artist': artist,
                'Album': album,
                'Status': status,
            })

    print(f"\nWriting results to {output_path}...")
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Rank', 'Artist', 'Album', 'Status'])
        writer.writeheader()
        writer.writerows(results)

    biggest_improvements = sorted((m for m in movers if m[0] > 0), reverse=True)
    biggest_drops = sorted(m for m in movers if m[0] < 0)
    no_change = sum(1 for m in movers if m[0] == 0)

    print("\nSummary:")
    print(f"Total albums in 2023 list: {len(results)}")
    print(f"New albums in 2023: {new_albums}")
    print(f"Albums with no ranking change: {no_change}")
    print(f"Albums that improved ranking: {len(biggest_improvements)}")
    print(f"Albums that dropped in ranking: {len(biggest_drops)}")

    print("\nTop 5 biggest improvements:")
    for change, artist, album, rank in biggest_improvements[:5]:
        print(f"  {artist} - {album}: {change} (now at #{rank})")

    print("\nTop 5 biggest drops:")
    for change, artist, album, rank in biggest_drops[:5]:
        print(f"  {artist} - {album}: {change} (now at #{rank})")

    print(f"\nResults saved to: {output_path}")


if __name__ == "__main__":
    main()