- Create wikipedia_500_albums.csv from Wikipedia:WikiProject_Albums/500
- Generate top_500_albums_2023.csv comparing 2020 vs 2023 rankings
- Add ranking change analysis (192 new, 164 improved, 113 dropped)
- Integrate Info and Description from 2020 Rolling Stone data
- Fill missing album information for 70+ additional albums
- Include Python scripts for data processing and analysis
- Update CLAUDE.md with project documentation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
118 lines
No EOL
4.8 KiB
Python
118 lines
No EOL
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Summary script to analyze the current state of the Top 500 Albums CSV file
|
|
and show statistics about missing and updated information.
|
|
"""
|
|
|
|
import csv
|
|
|
|
def read_csv_file(filepath):
    """Read a CSV file and return its rows as a list of dicts.

    Args:
        filepath: Path of the UTF-8 encoded CSV file to read. The first
            row of the file is used as the column header.

    Returns:
        A list with one dict per data row, keyed by column header.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded inside quoted fields are preserved exactly instead of being
    # rewritten by universal-newline translation.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))
|
|
|
|
def analyze_csv_data(data):
    """Count how many album rows lack 'Info' and/or 'Description' text.

    A field counts as missing when its key is absent, its value is None,
    or its value is empty / whitespace-only.

    Args:
        data: Iterable-with-length of dict-like rows (e.g. the list
            produced by ``csv.DictReader``).

    Returns:
        A dict with these keys:
            'total_albums': number of rows in ``data``.
            'missing_info': rows without 'Info' text.
            'missing_description': rows without 'Description' text.
            'missing_both': rows lacking both fields.
            'complete_albums': rows having both fields.
            'missing_albums_list': the rows counted in 'missing_both',
                in input order.
    """

    def _has_text(row, column):
        # csv.DictReader fills columns missing from a short row with None,
        # so normalise None to '' before stripping instead of crashing.
        return bool((row.get(column) or '').strip())

    missing_info = 0
    missing_description = 0
    complete_albums = 0
    missing_albums_list = []

    for row in data:
        has_info = _has_text(row, 'Info')
        has_description = _has_text(row, 'Description')

        if not has_info:
            missing_info += 1
        if not has_description:
            missing_description += 1

        if has_info and has_description:
            complete_albums += 1
        elif not has_info and not has_description:
            missing_albums_list.append(row)

    return {
        'total_albums': len(data),
        'missing_info': missing_info,
        'missing_description': missing_description,
        # Every row lacking both fields is collected in the list, so its
        # length is the "missing both" count.
        'missing_both': len(missing_albums_list),
        'complete_albums': complete_albums,
        'missing_albums_list': missing_albums_list,
    }
|
|
|
|
def main(csv_file='/home/lundberg/projects/top500albums/top_500_albums_2023.csv'):
    """Print a completeness report for the Top 500 Albums CSV file.

    Reads the CSV, then prints overall counts of rows missing 'Info' /
    'Description', the completion rate, and - when any rows lack both
    fields - the 20 best-ranked such albums, a breakdown by rank range and
    a short list of entries whose artist matches a fixed keyword list.

    Args:
        csv_file: Path of the CSV file to analyze. Defaults to the
            original hard-coded project location.

    Raises:
        OSError: If the CSV file cannot be opened.
        KeyError / ValueError: If a row lacking both fields has no 'Rank',
            'Artist' or 'Album' column, or a non-integer 'Rank'.
    """
    print("=== TOP 500 ALBUMS CSV ANALYSIS ===\n")

    # Read and analyze the data
    data = read_csv_file(csv_file)
    stats = analyze_csv_data(data)

    print(f"Total albums in database: {stats['total_albums']}")
    print(f"Albums with complete info (both Info and Description): {stats['complete_albums']}")
    print(f"Albums missing Info field: {stats['missing_info']}")
    print(f"Albums missing Description field: {stats['missing_description']}")
    print(f"Albums missing both Info and Description: {stats['missing_both']}")

    # Guard against an empty CSV (header only / no rows), which would
    # otherwise raise ZeroDivisionError.
    total_albums = stats['total_albums']
    if total_albums:
        completion_rate = (stats['complete_albums'] / total_albums) * 100
    else:
        completion_rate = 0.0
    print(f"\nCompletion rate: {completion_rate:.1f}%")

    # NOTE(review): 500 and 192 are hard-coded; presumably the list size
    # and the number of entries that originally lacked data. The
    # "(original missing)" label and the sign of the "filled in" figure
    # look suspect - confirm intent before relying on these two lines.
    print(f"Albums updated by our scripts: {500 - 192} (original missing) -> {stats['complete_albums']} (current complete)")
    print(f"Total albums updated: {500 - 192 - stats['complete_albums']} albums filled in")

    missing_albums = stats['missing_albums_list']
    if not missing_albums:
        print("\n🎉 ALL ALBUMS HAVE COMPLETE INFORMATION! 🎉")
        return

    print(f"\n=== REMAINING {len(missing_albums)} ALBUMS WITH MISSING INFO ===")

    # Order the incomplete albums by their numeric rank (best rank first).
    missing_by_rank = sorted(missing_albums, key=lambda x: int(x['Rank']))

    print("\nTop 20 albums still needing information:")
    for album in missing_by_rank[:20]:
        print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")

    # Only mention a remainder when there is one; with 20 or fewer missing
    # albums the subtraction would print zero or a negative count.
    remaining = len(missing_by_rank) - 20
    if remaining > 0:
        print(f"\n... and {remaining} more albums")

    # Show some statistics about the remaining albums
    print("\n=== ANALYSIS OF REMAINING ALBUMS ===")

    # Count by rank ranges (each rank is parsed once, then bucketed).
    ranks = [int(album['Rank']) for album in missing_by_rank]
    top_100 = sum(1 for rank in ranks if rank <= 100)
    rank_101_200 = sum(1 for rank in ranks if 101 <= rank <= 200)
    rank_201_300 = sum(1 for rank in ranks if 201 <= rank <= 300)
    rank_301_400 = sum(1 for rank in ranks if 301 <= rank <= 400)
    rank_401_500 = sum(1 for rank in ranks if 401 <= rank <= 500)

    print("Missing albums by rank range:")
    print(f" Rank 1-100: {top_100:>3} albums")
    print(f" Rank 101-200: {rank_101_200:>3} albums")
    print(f" Rank 201-300: {rank_201_300:>3} albums")
    print(f" Rank 301-400: {rank_301_400:>3} albums")
    print(f" Rank 401-500: {rank_401_500:>3} albums")

    # Show some notable missing albums that could be researched: entries
    # whose lower-cased artist name contains one of these substrings.
    notable_keywords = (
        'beatles', 'bob dylan', 'rolling stones', 'beach boys',
        'led zeppelin', 'pink floyd', 'david bowie', 'radiohead',
        'nirvana', 'sex pistols', 'velvet underground',
    )
    notable_missing = [
        album for album in missing_by_rank
        if any(keyword in album['Artist'].lower() for keyword in notable_keywords)
    ]

    if notable_missing:
        print("\n=== NOTABLE CLASSIC ARTISTS STILL MISSING INFO ===")
        for album in notable_missing[:10]:
            print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")
|
|
|
|
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()