Add comprehensive Top 500 Albums analysis with Wikipedia data integration

- Create wikipedia_500_albums.csv from Wikipedia:WikiProject_Albums/500
- Generate top_500_albums_2023.csv comparing 2020 vs 2023 rankings
- Add ranking change analysis (192 new, 164 improved, 113 dropped)
- Integrate Info and Description from 2020 Rolling Stone data
- Fill missing album information for 70+ additional albums
- Include Python scripts for data processing and analysis
- Update CLAUDE.md with project documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-30 21:17:25 +02:00 · 2025-06-30 21:17:25 +02:00 · 97ea973de0
commit 97ea973de0
parent 49ce813e59
8 changed files with 2607 additions and 0 deletions
--- a/album_info_summary.py
+++ b/album_info_summary.py
@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""
+Summary script to analyze the current state of the Top 500 Albums CSV file
+and show statistics about missing and updated information.
+"""
+
+import csv
+
+def read_csv_file(filepath):
+    """Read a CSV file and return its rows as a list of dicts.
+
+    Args:
+        filepath: Path to a UTF-8 encoded CSV file whose first row is the
+            header.
+
+    Returns:
+        A list with one dict per data row, keyed by the header names, as
+        produced by ``csv.DictReader``.
+    """
+    # newline='' hands line-ending handling to the csv module, so line breaks
+    # embedded in quoted fields (e.g. multi-line descriptions) are preserved
+    # instead of being rewritten by universal-newline translation.
+    with open(filepath, 'r', encoding='utf-8', newline='') as f:
+        return list(csv.DictReader(f))
+
+def analyze_csv_data(data):
+    """Count how many album rows have or lack 'Info' and 'Description' text.
+
+    A field counts as present only when it holds non-whitespace text.
+
+    Args:
+        data: Iterable-with-length of row dicts (e.g. from
+            ``csv.DictReader``).
+
+    Returns:
+        A dict with the keys 'total_albums', 'missing_info',
+        'missing_description', 'missing_both', 'complete_albums' (counts)
+        and 'missing_albums_list' (the rows lacking both fields, in input
+        order).
+    """
+    def has_text(row, field):
+        # csv.DictReader stores None for columns absent from a short row, so
+        # a plain .get(field, '') would still hand back None; coalesce first.
+        return bool((row.get(field) or '').strip())
+
+    missing_info = 0
+    missing_description = 0
+    complete_albums = 0
+    missing_albums_list = []
+
+    for row in data:
+        has_info = has_text(row, 'Info')
+        has_description = has_text(row, 'Description')
+
+        if not has_info:
+            missing_info += 1
+        if not has_description:
+            missing_description += 1
+
+        if has_info and has_description:
+            complete_albums += 1
+        elif not has_info and not has_description:
+            missing_albums_list.append(row)
+
+    return {
+        'total_albums': len(data),
+        'missing_info': missing_info,
+        'missing_description': missing_description,
+        # Every row lacking both fields is collected above, so the list
+        # length is the count.
+        'missing_both': len(missing_albums_list),
+        'complete_albums': complete_albums,
+        'missing_albums_list': missing_albums_list,
+    }
+
+def _album_rank(album):
+    """Return the album row's 'Rank' column as an int."""
+    return int(album['Rank'])
+
+
+def main(csv_file=None):
+    """Print a completeness report for the Top 500 Albums CSV.
+
+    Args:
+        csv_file: Path of the CSV file to analyze. When omitted, the
+            project's ``top_500_albums_2023.csv`` location is used.
+
+    Raises:
+        ValueError: If a listed album has a non-numeric 'Rank' value.
+        KeyError: If a listed album lacks 'Rank', 'Artist' or 'Album'.
+    """
+    if csv_file is None:
+        csv_file = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'
+
+    print("=== TOP 500 ALBUMS CSV ANALYSIS ===\n")
+
+    # Read and analyze the data
+    stats = analyze_csv_data(read_csv_file(csv_file))
+    total = stats['total_albums']
+    complete = stats['complete_albums']
+
+    print(f"Total albums in database: {total}")
+    print(f"Albums with complete info (both Info and Description): {complete}")
+    print(f"Albums missing Info field: {stats['missing_info']}")
+    print(f"Albums missing Description field: {stats['missing_description']}")
+    print(f"Albums missing both Info and Description: {stats['missing_both']}")
+
+    if not total:
+        # Nothing to compute a rate from; avoids a ZeroDivisionError below.
+        print("\nNo album rows found; nothing to analyze.")
+        return
+
+    completion_rate = (complete / total) * 100
+    print(f"\nCompletion rate: {completion_rate:.1f}%")
+
+    # NOTE(review): 192 is taken to be the number of albums new to the 2023
+    # list (no 2020 data to copy from), leaving 500 - 192 that were already
+    # complete before the fill-in scripts ran -- confirm against the data.
+    originally_complete = 500 - 192
+    print(f"Albums updated by our scripts: {originally_complete} (originally complete) -> {complete} (current complete)")
+    # Filled-in count is current minus baseline; the reverse goes negative.
+    print(f"Total albums updated: {complete - originally_complete} albums filled in")
+
+    missing = stats['missing_albums_list']
+    if not missing:
+        if complete == total:
+            print("\n🎉 ALL ALBUMS HAVE COMPLETE INFORMATION! 🎉")
+        else:
+            # The list only holds albums lacking BOTH fields; some may still
+            # lack one of them, so do not claim everything is complete.
+            print(f"\nNo albums are missing both fields, but {total - complete} still lack Info or Description.")
+        return
+
+    print(f"\n=== REMAINING {len(missing)} ALBUMS WITH MISSING INFO ===")
+
+    missing_by_rank = sorted(missing, key=_album_rank)
+
+    print("\nTop 20 albums still needing information:")
+    for album in missing_by_rank[:20]:
+        print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")
+
+    remaining = len(missing_by_rank) - 20
+    if remaining > 0:
+        print(f"\n... and {remaining} more albums")
+
+    # Show some statistics about the remaining albums
+    print("\n=== ANALYSIS OF REMAINING ALBUMS ===")
+
+    # Parse each rank once, then bucket by rank range.
+    ranks = [_album_rank(album) for album in missing_by_rank]
+    print("Missing albums by rank range:")
+    print(f"  Rank 1-100:   {sum(r <= 100 for r in ranks):>3} albums")
+    for low in (101, 201, 301, 401):
+        high = low + 99
+        in_range = sum(low <= r <= high for r in ranks)
+        print(f"  Rank {low}-{high}: {in_range:>3} albums")
+
+    # Show some notable missing albums that could be researched
+    keywords = (
+        'beatles', 'bob dylan', 'rolling stones', 'beach boys',
+        'led zeppelin', 'pink floyd', 'david bowie', 'radiohead',
+        'nirvana', 'sex pistols', 'velvet underground',
+    )
+    notable_missing = [
+        album for album in missing_by_rank
+        if any(keyword in album['Artist'].lower() for keyword in keywords)
+    ]
+
+    if notable_missing:
+        print("\n=== NOTABLE CLASSIC ARTISTS STILL MISSING INFO ===")
+        for album in notable_missing[:10]:
+            print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")
+
+
+if __name__ == "__main__":
+    import sys
+
+    # An optional first command-line argument overrides the default CSV path.
+    main(sys.argv[1] if len(sys.argv) > 1 else None)