#!/usr/bin/env python3
"""
Summary script to analyze the current state of the Top 500 Albums CSV file
and show statistics about missing and updated information.
"""

import csv
import sys

# Default CSV location; override by passing a path to main() or as argv[1].
DEFAULT_CSV_FILE = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'

# How many missing albums are listed individually before the list is cut off.
MAX_LISTED_ALBUMS = 20

# How many "notable artist" albums are listed at most.
MAX_NOTABLE_ALBUMS = 10

# Lower-case substrings matched against the artist name.
NOTABLE_ARTIST_KEYWORDS = (
    'beatles', 'bob dylan', 'rolling stones', 'beach boys', 'led zeppelin',
    'pink floyd', 'david bowie', 'radiohead', 'nirvana', 'sex pistols',
    'velvet underground',
)


def read_csv_file(filepath):
    """Read the CSV file and return its rows as a list of dicts.

    Args:
        filepath: Path of a UTF-8 encoded CSV file with a header row.

    Returns:
        One dict per data row, keyed by the header names.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))


def _has_text(row, field):
    """Return True when *field* of *row* holds non-whitespace text."""
    # DictReader stores None for fields missing from a short row, so a plain
    # row.get(field, '') is not enough to guarantee a string.
    return bool((row.get(field) or '').strip())


def analyze_csv_data(data):
    """Count how many albums lack the 'Info' and/or 'Description' fields.

    A field counts as missing when it is absent, None, empty, or whitespace.

    Args:
        data: List of row dicts as returned by read_csv_file().

    Returns:
        Dict with the keys 'total_albums', 'missing_info',
        'missing_description', 'missing_both', 'complete_albums' (counts) and
        'missing_albums_list' (the rows missing both fields, in input order).
    """
    missing_info = 0
    missing_description = 0
    complete_albums = 0
    missing_albums_list = []

    for row in data:
        has_info = _has_text(row, 'Info')
        has_description = _has_text(row, 'Description')

        if not has_info:
            missing_info += 1
        if not has_description:
            missing_description += 1

        if has_info and has_description:
            complete_albums += 1
        elif not has_info and not has_description:
            missing_albums_list.append(row)

    return {
        'total_albums': len(data),
        'missing_info': missing_info,
        'missing_description': missing_description,
        'missing_both': len(missing_albums_list),
        'complete_albums': complete_albums,
        'missing_albums_list': missing_albums_list,
    }


def _rank(album):
    """Return the album's 'Rank' column as an int."""
    return int(album['Rank'])


def _print_overview(stats):
    """Print the headline counts and the completion rate."""
    total = stats['total_albums']
    complete = stats['complete_albums']

    print(f"Total albums in database: {total}")
    print(f"Albums with complete info (both Info and Description): {complete}")
    print(f"Albums missing Info field: {stats['missing_info']}")
    print(f"Albums missing Description field: {stats['missing_description']}")
    print(f"Albums missing both Info and Description: {stats['missing_both']}")

    # An empty CSV has no albums; report 0% instead of dividing by zero.
    completion_rate = (complete / total) * 100 if total else 0.0
    print(f"\nCompletion rate: {completion_rate:.1f}%")

    # NOTE(review): 500 and 192 are hard-coded baseline figures and the second
    # expression goes negative once more than 308 albums are complete; the
    # intended formula is not clear from this file, so it is left unchanged.
    print(f"Albums updated by our scripts: {500 - 192} (original missing) -> {complete} (current complete)")
    print(f"Total albums updated: {500 - 192 - complete} albums filled in")


def _print_missing_report(missing_albums):
    """Print the listing and breakdowns for albums missing both fields."""
    print(f"\n=== REMAINING {len(missing_albums)} ALBUMS WITH MISSING INFO ===")

    missing_by_rank = sorted(missing_albums, key=_rank)

    print("\nTop 20 albums still needing information:")
    for album in missing_by_rank[:MAX_LISTED_ALBUMS]:
        print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")

    # Only mention a remainder when the list was actually truncated.
    remaining = len(missing_by_rank) - MAX_LISTED_ALBUMS
    if remaining > 0:
        print(f"\n... and {remaining} more albums")

    print("\n=== ANALYSIS OF REMAINING ALBUMS ===")

    # Parse each rank once and reuse it for every range.
    ranks = [_rank(album) for album in missing_by_rank]
    top_100 = sum(r <= 100 for r in ranks)
    rank_101_200 = sum(101 <= r <= 200 for r in ranks)
    rank_201_300 = sum(201 <= r <= 300 for r in ranks)
    rank_301_400 = sum(301 <= r <= 400 for r in ranks)
    rank_401_500 = sum(401 <= r <= 500 for r in ranks)

    print("Missing albums by rank range:")
    print(f" Rank 1-100: {top_100:>3} albums")
    print(f" Rank 101-200: {rank_101_200:>3} albums")
    print(f" Rank 201-300: {rank_201_300:>3} albums")
    print(f" Rank 301-400: {rank_301_400:>3} albums")
    print(f" Rank 401-500: {rank_401_500:>3} albums")

    # Highlight well-known artists whose albums could be researched first.
    notable_missing = [
        album for album in missing_by_rank
        if any(keyword in album['Artist'].lower()
               for keyword in NOTABLE_ARTIST_KEYWORDS)
    ]
    if notable_missing:
        print("\n=== NOTABLE CLASSIC ARTISTS STILL MISSING INFO ===")
        for album in notable_missing[:MAX_NOTABLE_ALBUMS]:
            print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")


def main(csv_file=DEFAULT_CSV_FILE):
    """Analyze the album CSV and print a summary report to stdout.

    Args:
        csv_file: Path of the CSV file to analyze. Defaults to
            DEFAULT_CSV_FILE, the path this script originally hard-coded.
    """
    print("=== TOP 500 ALBUMS CSV ANALYSIS ===\n")

    stats = analyze_csv_data(read_csv_file(csv_file))
    _print_overview(stats)

    if stats['missing_albums_list']:
        _print_missing_report(stats['missing_albums_list'])
    elif stats['complete_albums'] == stats['total_albums']:
        print("\n🎉 ALL ALBUMS HAVE COMPLETE INFORMATION! 🎉")
    else:
        # No album lacks both fields, but some still lack one of them, so the
        # "all complete" message would be false here.
        partial = stats['total_albums'] - stats['complete_albums']
        print(f"\nNo albums are missing both fields, but {partial} "
              f"still lack Info or Description.")


if __name__ == "__main__":
    # An optional first command-line argument overrides the default CSV path.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CSV_FILE)