- Added all 89 albums that were genuinely dropped from 2020 to 2023
- Fixed incorrect status markings (many albums marked "New in 2023" were not new)
- Removed duplicates and albums incorrectly marked as dropped
- Final count: 589 total (500 main list + 89 dropped)
- Updated JavaScript validation for extended range
- Created comprehensive analysis scripts to verify data

Math now adds up correctly: 89 albums dropped to make room for new additions

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
69 lines
No EOL
2.4 KiB
Python
69 lines
No EOL
2.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Remove duplicate dropped albums that have slightly different names.
|
|
"""
|
|
|
|
import csv
|
|
|
|
def normalize_for_comparison(text):
    """Case-fold and trim an artist/album name so near-identical
    dropped entries compare equal for duplicate detection."""
    normalized = text.lower().strip()
    # Canonicalize the "(The ...)" variants observed in the data set.
    replacements = {
        '(the black album)': '(black album)',
        '(the blue album)': '(blue album)',
    }
    for variant, canonical in replacements.items():
        normalized = normalized.replace(variant, canonical)
    return normalized
|
|
|
|
def main():
    """Load the album CSV, remove duplicate 'Dropped' rows (matched on
    normalized artist/album names), renumber ranks above 500 so the
    extended section stays contiguous, and rewrite the file in place.
    """

    def _normalize(value):
        # Local copy of the name normalization used for duplicate keys.
        return (value.lower().strip()
                .replace('(the black album)', '(black album)')
                .replace('(the blue album)', '(blue album)'))

    # Load every row from the current CSV.
    with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as file:
        albums = list(csv.DictReader(file))

    print(f"📊 Total albums before cleanup: {len(albums)}")

    # Scan dropped albums, remembering the first occurrence of each
    # normalized (artist, album) key and flagging later repeats.
    seen_dropped = {}
    duplicates = []
    for index, album in enumerate(albums):
        if 'Dropped' not in album['Status']:
            continue
        key = (_normalize(album['Artist']), _normalize(album['Album']))
        if key not in seen_dropped:
            seen_dropped[key] = album
            continue
        first = seen_dropped[key]
        print(f"❌ Duplicate found:")
        print(f"   First: Rank {first['Rank']} - {first['Artist']} - {first['Album']}")
        print(f"   Second: Rank {album['Rank']} - {album['Artist']} - {album['Album']}")
        duplicates.append(index)

    # Drop the flagged rows (rebuild the list instead of deleting in place).
    if duplicates:
        print(f"\n🗑️ Removing {len(duplicates)} duplicate entries...")
        doomed = set(duplicates)
        albums = [row for i, row in enumerate(albums) if i not in doomed]

    # Re-sequence the post-500 section so ranks remain contiguous.
    next_rank = 501
    for album in albums:
        if int(album['Rank']) > 500:
            album['Rank'] = str(next_rank)
            next_rank += 1

    # Persist the cleaned list back to the same file.
    with open('top_500_albums_2023.csv', 'w', newline='', encoding='utf-8') as file:
        fieldnames = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(albums)

    print(f"\n✅ Cleanup complete!")
    print(f"📊 Total albums now: {len(albums)}")
    print(f"📊 Total dropped albums: {len([a for a in albums if 'Dropped' in a['Status']])}")
|
|
|
|
# Entry point: run the cleanup when invoked as a script.
if __name__ == "__main__":
    main()