#!/usr/bin/env python3
"""
Remove duplicate dropped albums that have slightly different names.
"""
import csv

# Default CSV this script cleans in place (kept for backward compatibility).
DEFAULT_CSV = 'top_500_albums_2023.csv'

# Column order used when rewriting the CSV.
FIELDNAMES = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']


def normalize_for_comparison(text):
    """Normalize album names for duplicate detection.

    Lowercases and strips *text*, then collapses the two known
    "(The ...)" parenthetical variants so that e.g.
    "(The Black Album)" and "(Black Album)" compare equal.
    """
    text = text.lower().strip()
    # Remove "The" from album names in parentheses
    text = text.replace('(the black album)', '(black album)')
    text = text.replace('(the blue album)', '(blue album)')
    return text


def _read_albums(path):
    """Read every row of the CSV at *path* as a list of dicts."""
    with open(path, 'r', encoding='utf-8') as file:
        return list(csv.DictReader(file))


def _find_duplicate_indices(albums):
    """Return indices of dropped albums duplicating an earlier dropped entry.

    Two dropped rows are duplicates when their normalized
    (artist, album) keys match; the first occurrence is kept.
    Prints a report line for each duplicate found.
    """
    seen_dropped = {}
    duplicates = []
    for i, album in enumerate(albums):
        # Only rows whose Status mentions "Dropped" participate in dedup.
        if 'Dropped' not in album['Status']:
            continue
        key = (normalize_for_comparison(album['Artist']),
               normalize_for_comparison(album['Album']))
        if key in seen_dropped:
            first = seen_dropped[key]
            print("āŒ Duplicate found:")
            print(f"   First: Rank {first['Rank']} - {first['Artist']} - {first['Album']}")
            print(f"   Second: Rank {album['Rank']} - {album['Artist']} - {album['Album']}")
            duplicates.append(i)
        else:
            seen_dropped[key] = album
    return duplicates


def _renumber_overflow(albums):
    """Re-sequence ranks above 500 so they stay contiguous after removals.

    Mutates the 'Rank' field in place; rows ranked 1-500 are untouched.
    """
    current_rank = 501
    for album in albums:
        if int(album['Rank']) > 500:
            album['Rank'] = str(current_rank)
            current_rank += 1


def _write_albums(path, albums):
    """Rewrite the CSV at *path* with the cleaned album rows."""
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(albums)


def main(csv_path=DEFAULT_CSV):
    """Deduplicate dropped albums in *csv_path* and renumber overflow ranks.

    Reads the CSV, removes later duplicates among dropped albums,
    renumbers ranks above 500 to stay contiguous, and writes the file
    back in place. *csv_path* defaults to the historical hard-coded file.
    """
    albums = _read_albums(csv_path)
    print(f"šŸ“Š Total albums before cleanup: {len(albums)}")

    # Find duplicates among dropped albums
    duplicates = _find_duplicate_indices(albums)

    # Remove duplicates (keep the first occurrence of each key).
    if duplicates:
        print(f"\nšŸ—‘ļø Removing {len(duplicates)} duplicate entries...")
        drop = set(duplicates)
        albums = [a for i, a in enumerate(albums) if i not in drop]

    # Renumber albums after 500
    _renumber_overflow(albums)

    # Write cleaned CSV
    _write_albums(csv_path, albums)

    print("\nāœ… Cleanup complete!")
    print(f"šŸ“Š Total albums now: {len(albums)}")
    print(f"šŸ“Š Total dropped albums: {len([a for a in albums if 'Dropped' in a['Status']])}")


if __name__ == "__main__":
    main()