Add comprehensive Top 500 Albums analysis with Wikipedia data integration

- Create wikipedia_500_albums.csv from Wikipedia:WikiProject_Albums/500
- Generate top_500_albums_2023.csv comparing 2020 vs 2023 rankings
- Add ranking change analysis (192 new, 164 improved, 113 dropped)
- Integrate Info and Description from 2020 Rolling Stone data
- Fill missing album information for 70+ additional albums
- Include Python scripts for data processing and analysis
- Update CLAUDE.md with project documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Johan Lundberg 2025-06-30 21:17:25 +02:00
parent 49ce813e59
commit 97ea973de0
8 changed files with 2607 additions and 0 deletions

118
album_info_summary.py Normal file
View file

@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Summary script to analyze the current state of the Top 500 Albums CSV file
and show statistics about missing and updated information.
"""
import csv
def read_csv_file(filepath):
    """Read a CSV file and return its rows as a list of dicts.

    Args:
        filepath: Path of the UTF-8 encoded CSV file to read. The first
            line is used as the header row by ``csv.DictReader``.

    Returns:
        A list with one mapping per data row, keyed by the header names.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # newline='' is required by the csv module so that line breaks embedded
    # in quoted fields (e.g. multi-paragraph descriptions) are passed to the
    # parser untouched instead of being rewritten by universal-newline mode.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))
def analyze_csv_data(data):
    """Count how many album rows lack ``Info`` and/or ``Description``.

    A field is treated as missing when its key is absent, its value is
    ``None`` (what ``csv.DictReader`` yields for rows shorter than the
    header), or it contains only whitespace.

    Args:
        data: Sized iterable of row mappings (as returned by
            ``read_csv_file``).

    Returns:
        A dict with the keys:
            ``total_albums``: number of rows in ``data``.
            ``missing_info``: rows without ``Info``.
            ``missing_description``: rows without ``Description``.
            ``missing_both``: rows lacking both fields.
            ``complete_albums``: rows having both fields.
            ``missing_albums_list``: the rows counted in ``missing_both``,
                in input order.
    """
    def _has_text(row, field):
        # `or ''` covers both an absent key and an explicit None value, so
        # short CSV rows no longer crash on `.strip()`.
        return bool((row.get(field) or '').strip())

    missing_info = 0
    missing_description = 0
    complete_albums = 0
    missing_albums_list = []

    for row in data:
        has_info = _has_text(row, 'Info')
        has_description = _has_text(row, 'Description')

        if not has_info:
            missing_info += 1
        if not has_description:
            missing_description += 1

        if has_info and has_description:
            complete_albums += 1
        elif not has_info and not has_description:
            # Only rows missing *both* fields are reported back to the caller.
            missing_albums_list.append(row)

    return {
        'total_albums': len(data),
        'missing_info': missing_info,
        'missing_description': missing_description,
        'missing_both': len(missing_albums_list),
        'complete_albums': complete_albums,
        'missing_albums_list': missing_albums_list,
    }
def main(csv_file=None):
    """Print a completeness report for the Top 500 Albums CSV.

    Reads the CSV with ``read_csv_file``, summarizes it with
    ``analyze_csv_data`` and prints overall counts, the completion rate,
    and, when rows still lack both ``Info`` and ``Description``, a preview
    of those rows, a breakdown by rank range, and any rows by a fixed list
    of well-known artists.

    Args:
        csv_file: Path of the CSV to analyze. Defaults to the project's
            ``top_500_albums_2023.csv`` location when omitted.

    Raises:
        Whatever ``read_csv_file`` raises for an unreadable file, plus
        ``KeyError``/``ValueError`` if a listed row lacks ``Rank``,
        ``Artist`` or ``Album`` or has a non-integer ``Rank``.
    """
    if csv_file is None:
        csv_file = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'

    preview_limit = 20
    notable_keywords = (
        'beatles', 'bob dylan', 'rolling stones', 'beach boys',
        'led zeppelin', 'pink floyd', 'david bowie', 'radiohead',
        'nirvana', 'sex pistols', 'velvet underground',
    )
    # (label, lowest rank or None for "no lower bound", highest rank)
    rank_ranges = (
        ('1-100', None, 100),
        ('101-200', 101, 200),
        ('201-300', 201, 300),
        ('301-400', 301, 400),
        ('401-500', 401, 500),
    )

    print("=== TOP 500 ALBUMS CSV ANALYSIS ===\n")

    stats = analyze_csv_data(read_csv_file(csv_file))
    total = stats['total_albums']
    complete = stats['complete_albums']

    print(f"Total albums in database: {total}")
    print(f"Albums with complete info (both Info and Description): {complete}")
    print(f"Albums missing Info field: {stats['missing_info']}")
    print(f"Albums missing Description field: {stats['missing_description']}")
    print(f"Albums missing both Info and Description: {stats['missing_both']}")

    # An empty CSV previously crashed here with ZeroDivisionError.
    completion_rate = (complete / total) * 100 if total else 0.0
    print(f"\nCompletion rate: {completion_rate:.1f}%")

    # Hard-coded baseline: 500 albums minus 192 that initially had no info
    # (presumably the albums new to the 2023 list -- TODO confirm).
    # The original printed this figure as "original missing" and subtracted
    # in the wrong direction, giving a negative "filled in" count.
    originally_complete = 500 - 192
    print(f"Albums updated by our scripts: {originally_complete} (originally complete) -> {complete} (current complete)")
    print(f"Total albums updated: {complete - originally_complete} albums filled in")

    missing = stats['missing_albums_list']
    if not missing:
        print("\n🎉 ALL ALBUMS HAVE COMPLETE INFORMATION! 🎉")
        return

    print(f"\n=== REMAINING {len(missing)} ALBUMS WITH MISSING INFO ===")

    missing_by_rank = sorted(missing, key=lambda album: int(album['Rank']))

    print("\nTop 20 albums still needing information:")
    for album in missing_by_rank[:preview_limit]:
        print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")

    # Only mention the remainder when there is one; the original printed a
    # negative number when fewer than 20 albums were missing.
    remainder = len(missing_by_rank) - preview_limit
    if remainder > 0:
        print(f"\n... and {remainder} more albums")

    print("\n=== ANALYSIS OF REMAINING ALBUMS ===")

    # Parse each rank once instead of once per range.
    ranks = [int(album['Rank']) for album in missing_by_rank]
    print("Missing albums by rank range:")
    for label, low, high in rank_ranges:
        count = sum(
            1 for rank in ranks
            if (low is None or low <= rank) and rank <= high
        )
        print(f" Rank {label}: {count:>3} albums")

    # Substring match on the lower-cased artist name.
    notable_missing = [
        album for album in missing_by_rank
        if any(keyword in album['Artist'].lower() for keyword in notable_keywords)
    ]
    if notable_missing:
        print("\n=== NOTABLE CLASSIC ARTISTS STILL MISSING INFO ===")
        for album in notable_missing[:10]:
            print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")
# Run the report only when this file is executed as a script, so importing
# the module for its helper functions has no side effects.
if __name__ == "__main__":
    main()