top500albums/album_info_summary.py

#!/usr/bin/env python3
"""
Summary script to analyze the current state of the Top 500 Albums CSV file
and show statistics about missing and updated information.
"""

import csv

def read_csv_file(filepath):
    """Read a CSV file and return its rows as a list of dicts.

    The first line of the file is treated as the header; every following
    row becomes a dict keyed by the header names (via ``csv.DictReader``).

    Args:
        filepath: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # newline='' hands raw line endings to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))

def analyze_csv_data(data):
    """Count how many album rows have or lack 'Info' and 'Description'.

    A field counts as present only when it holds non-whitespace text.
    A missing key, a ``None`` value, or an empty/blank string all count
    as missing.

    Args:
        data: Sequence of row dicts (as produced by ``csv.DictReader``).

    Returns:
        A dict with these keys:
            'total_albums': number of rows in ``data``.
            'missing_info': rows whose 'Info' is missing.
            'missing_description': rows whose 'Description' is missing.
            'missing_both': rows missing both fields.
            'complete_albums': rows with both fields present.
            'missing_albums_list': the rows that are missing both fields,
                in input order.
    """
    total_albums = len(data)

    missing_info = 0
    missing_description = 0
    missing_both = 0
    complete_albums = 0

    missing_albums_list = []

    for row in data:
        # csv.DictReader fills absent trailing columns with None, so the
        # key can exist with a None value; coalesce before calling strip().
        has_info = bool((row.get('Info') or '').strip())
        has_description = bool((row.get('Description') or '').strip())

        if not has_info:
            missing_info += 1
        if not has_description:
            missing_description += 1

        if has_info and has_description:
            complete_albums += 1
        elif not has_info and not has_description:
            missing_both += 1
            missing_albums_list.append(row)

    return {
        'total_albums': total_albums,
        'missing_info': missing_info,
        'missing_description': missing_description,
        'missing_both': missing_both,
        'complete_albums': complete_albums,
        'missing_albums_list': missing_albums_list,
    }

def main(csv_file='/home/lundberg/projects/top500albums/top_500_albums_2023.csv'):
    """Print a completeness report for the album CSV to stdout.

    Reads the CSV, prints overall counts and the completion rate, then,
    if any albums lack both 'Info' and 'Description', lists up to 20 of
    them in rank order, a breakdown by rank range, and up to 10 of them
    whose artist matches a fixed list of well-known names.

    Args:
        csv_file: Path of the CSV file to analyze. Defaults to the
            project's original hard-coded location.

    Raises:
        OSError: If the CSV file cannot be opened.
        KeyError: If a listed row has no 'Rank', 'Artist' or 'Album' column.
        ValueError: If a listed row's 'Rank' is not an integer.
    """
    print("=== TOP 500 ALBUMS CSV ANALYSIS ===\n")

    # Read and analyze the data
    data = read_csv_file(csv_file)
    stats = analyze_csv_data(data)

    print(f"Total albums in database: {stats['total_albums']}")
    print(f"Albums with complete info (both Info and Description): {stats['complete_albums']}")
    print(f"Albums missing Info field: {stats['missing_info']}")
    print(f"Albums missing Description field: {stats['missing_description']}")
    print(f"Albums missing both Info and Description: {stats['missing_both']}")

    # An empty CSV has no rows to divide by; report 0% instead of crashing.
    total_albums = stats['total_albums']
    completion_rate = (stats['complete_albums'] / total_albums) * 100 if total_albums else 0.0
    print(f"\nCompletion rate: {completion_rate:.1f}%")
    # NOTE(review): the 500/192 baseline is hard-coded and the subtraction
    # below goes negative once more than 308 albums are complete. The
    # intended formula is not derivable from this file -- confirm and fix.
    print(f"Albums updated by our scripts: {500 - 192} (original missing) -> {stats['complete_albums']} (current complete)")
    print(f"Total albums updated: {500 - 192 - stats['complete_albums']} albums filled in")

    missing_albums = stats['missing_albums_list']
    if not missing_albums:
        print("\n🎉 ALL ALBUMS HAVE COMPLETE INFORMATION! 🎉")
        return

    print(f"\n=== REMAINING {len(missing_albums)} ALBUMS WITH MISSING INFO ===")

    # Parse each rank once, then order the albums by it (stable sort).
    ranked = sorted(
        ((int(album['Rank']), album) for album in missing_albums),
        key=lambda pair: pair[0],
    )
    missing_by_rank = [album for _, album in ranked]
    ranks = [rank for rank, _ in ranked]

    preview_size = 20
    print("\nTop 20 albums still needing information:")
    for album in missing_by_rank[:preview_size]:
        print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")

    # Only mention a remainder when there actually is one; with fewer
    # than 20 missing albums the count would otherwise be negative.
    remaining = len(missing_by_rank) - preview_size
    if remaining > 0:
        print(f"\n... and {remaining} more albums")

    # Show some statistics about the remaining albums
    print("\n=== ANALYSIS OF REMAINING ALBUMS ===")

    # (label, lower bound, upper bound); None means "no lower bound", so
    # the first bucket counts every rank <= 100.
    rank_ranges = (
        ("Rank 1-100:", None, 100),
        ("Rank 101-200:", 101, 200),
        ("Rank 201-300:", 201, 300),
        ("Rank 301-400:", 301, 400),
        ("Rank 401-500:", 401, 500),
    )
    print("Missing albums by rank range:")
    for label, low, high in rank_ranges:
        count = sum(1 for rank in ranks if (low is None or low <= rank) and rank <= high)
        print(f"  {label:<13} {count:>3} albums")

    # Show some notable missing albums that could be researched
    notable_keywords = (
        'beatles', 'bob dylan', 'rolling stones', 'beach boys',
        'led zeppelin', 'pink floyd', 'david bowie', 'radiohead',
        'nirvana', 'sex pistols', 'velvet underground',
    )
    notable_missing = []
    for album in missing_by_rank:
        artist = album['Artist'].lower()
        # Substring match, so e.g. "The Beatles" matches 'beatles'.
        if any(keyword in artist for keyword in notable_keywords):
            notable_missing.append(album)

    if notable_missing:
        print("\n=== NOTABLE CLASSIC ARTISTS STILL MISSING INFO ===")
        for album in notable_missing[:10]:
            print(f"{album['Rank']:>3}. {album['Artist']} - {album['Album']}")

# Run the report only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()