#!/usr/bin/env python3
"""
Script to merge Info and Description columns from 2020 Rolling Stone data
into the combined 2023 top 500 albums CSV file.
"""

import csv
from difflib import SequenceMatcher
import re

# Default locations used when main() is called without arguments.
DEFAULT_FILE_2023 = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'
DEFAULT_FILE_2020 = '/home/lundberg/projects/top500albums/rolling_stone_top_500_albums_2020.csv'

# Columns always written to the updated 2023 file, in this order.
OUTPUT_FIELDNAMES = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']

# Patterns are compiled once because normalization runs for every comparison.
_PUNCTUATION = re.compile(r'[^\w\s&]')
_WHITESPACE_RUN = re.compile(r'\s+')
# A new 2020 record starts with the rank field: digits followed by a comma.
_RECORD_START = re.compile(r'\d+\s*,')


def normalize_string(s):
    """Normalize a string for matching.

    Lowercases and strips the value, removes every character that is not a
    word character, whitespace or ``&``, and collapses whitespace runs to a
    single space. ``None`` and the empty string both normalize to ``""``.
    """
    if s is None or s == '':
        return ""
    s = str(s).lower().strip()
    s = _PUNCTUATION.sub('', s)
    return _WHITESPACE_RUN.sub(' ', s)


def similarity_ratio(a, b):
    """Return the difflib similarity of two strings as a 0-100 percentage."""
    return SequenceMatcher(None, a, b).ratio() * 100


def fuzzy_match_albums(artist1, album1, artist2, album2, threshold=85):
    """Decide whether two (artist, album) pairs refer to the same album.

    All four strings are normalized first. Returns True only when the artist
    similarity AND the album similarity are both at or above ``threshold``
    (a 0-100 percentage).
    """
    artist_score = similarity_ratio(normalize_string(artist1),
                                    normalize_string(artist2))
    album_score = similarity_ratio(normalize_string(album1),
                                   normalize_string(album2))
    return artist_score >= threshold and album_score >= threshold


def _store_2020_record(raw_row, albums_2020):
    """Parse one reassembled CSV record and add it to ``albums_2020``.

    Records that cannot be parsed, or that have fewer than five fields, are
    skipped on purpose: loading is best-effort.
    """
    try:
        fields = next(csv.reader([raw_row]), [])
    except csv.Error:
        return  # Skip malformed rows
    if len(fields) < 5:
        return
    _rank, artist, album, info, description = fields[:5]
    key = normalize_string(f"{artist} {album}")
    # A later record with the same normalized key replaces the earlier one.
    albums_2020[key] = {
        'artist': artist,
        'album': album,
        'info': info,
        'description': description,
    }


def load_2020_data(filepath):
    """Load 2020 data, reassembling descriptions that span several lines.

    The first line of the file is treated as a header and skipped. A line
    that begins with a rank (digits followed by a comma) starts a new record;
    any other line is appended, separated by a single space, to the record
    being built. Lines before the first record are ignored.

    Returns a dict keyed by ``normalize_string("<artist> <album>")`` whose
    values are dicts with the keys ``artist``, ``album``, ``info`` and
    ``description``.
    """
    albums_2020 = {}

    with open(filepath, 'r', encoding='utf-8') as file:
        # Skip the header; the default keeps an empty file from raising.
        next(file, None)

        current_row = None
        for line in file:
            line = line.strip()

            # Requiring "digits," rather than just a leading digit keeps a
            # continuation line such as "1971 was ..." inside its record.
            if _RECORD_START.match(line):
                if current_row:
                    _store_2020_record(current_row, albums_2020)
                current_row = line
            elif current_row:
                # Continue multi-line description
                current_row += " " + line

        # Don't forget the last record
        if current_row:
            _store_2020_record(current_row, albums_2020)

    return albums_2020


def load_2023_data(filepath):
    """Load the 2023 CSV and return its rows as a list of dicts."""
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filepath, 'r', newline='', encoding='utf-8') as file:
        return list(csv.DictReader(file))


def save_2023_data(filepath, albums_2023):
    """Write the updated 2023 rows to ``filepath`` as CSV.

    Does nothing when ``albums_2023`` is empty. The standard columns are
    written first; any additional named column present in the rows is
    appended after them so it is preserved instead of aborting the write.
    """
    if not albums_2023:
        return

    # Resolve the header BEFORE opening the file: opening in 'w' mode
    # truncates it, so a failure after that point would destroy the data.
    fieldnames = list(OUTPUT_FIELDNAMES)
    known = set(fieldnames)
    for album in albums_2023:
        for column in album:
            if isinstance(column, str) and column not in known:
                known.add(column)
                fieldnames.append(column)

    with open(filepath, 'w', newline='', encoding='utf-8') as file:
        # extrasaction='ignore' only drops non-string keys, i.e. the None key
        # csv.DictReader uses for cells beyond the header width.
        writer = csv.DictWriter(file, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(albums_2023)


def _best_fuzzy_match(artist, album, candidates, threshold=85):
    """Return ``(best_album, best_score)`` for the closest 2020 candidate.

    ``candidates`` is a sequence of ``(normalized_artist, normalized_album,
    album_dict)`` tuples. A candidate qualifies under the same rule as
    fuzzy_match_albums (both scores at or above ``threshold``); qualifying
    candidates are ranked by the mean of the two scores and the first one
    seen wins a tie. Returns ``(None, 0)`` when nothing qualifies.
    """
    artist_norm = normalize_string(artist)
    album_norm = normalize_string(album)

    best_match = None
    best_score = 0
    for cand_artist, cand_album, cand in candidates:
        artist_score = similarity_ratio(artist_norm, cand_artist)
        if artist_score < threshold:
            continue
        album_score = similarity_ratio(album_norm, cand_album)
        if album_score < threshold:
            continue
        combined_score = (artist_score + album_score) / 2
        if combined_score > best_score:
            best_score = combined_score
            best_match = cand
    return best_match, best_score


def main(file_2023=DEFAULT_FILE_2023, file_2020=DEFAULT_FILE_2020):
    """Copy Info/Description from the 2020 list into the 2023 CSV.

    Each 2023 album is matched first by exact normalized "artist album" key,
    then by fuzzy matching. The 2023 file is rewritten in place and progress
    is printed to stdout.

    Args:
        file_2023: Path of the 2023 CSV to read and overwrite.
        file_2020: Path of the 2020 CSV that supplies Info and Description.
    """
    print("Loading 2023 data...")
    albums_2023 = load_2023_data(file_2023)
    print(f"Loaded {len(albums_2023)} albums from 2023 data")

    print("Loading 2020 data...")
    albums_2020 = load_2020_data(file_2020)
    print(f"Loaded {len(albums_2020)} albums from 2020 data")

    # Add new columns to 2023 data
    for album in albums_2023:
        album['Info'] = ''
        album['Description'] = ''

    # Normalize the 2020 names once instead of once per 2023 album.
    candidates = [
        (normalize_string(entry['artist']), normalize_string(entry['album']), entry)
        for entry in albums_2020.values()
    ]

    matches_found = 0
    no_matches = 0

    print("Matching albums...")

    for album_2023 in albums_2023:
        artist_2023 = album_2023['Artist']
        album_title_2023 = album_2023['Album']

        # First try exact match
        key_2023 = normalize_string(f"{artist_2023} {album_title_2023}")
        exact = albums_2020.get(key_2023)

        if exact is not None:
            album_2023['Info'] = exact['info']
            album_2023['Description'] = exact['description']
            matches_found += 1
            print(f"✓ Exact match: {artist_2023} - {album_title_2023}")
            continue

        # Try fuzzy matching
        best_match, best_score = _best_fuzzy_match(
            artist_2023, album_title_2023, candidates)

        if best_match:
            album_2023['Info'] = best_match['info']
            album_2023['Description'] = best_match['description']
            matches_found += 1
            print(f"✓ Fuzzy match ({best_score:.1f}%): {artist_2023} - {album_title_2023} → {best_match['artist']} - {best_match['album']}")
        else:
            no_matches += 1
            print(f"✗ No match: {artist_2023} - {album_title_2023}")

    print("\nMatching complete!")
    print(f"Matches found: {matches_found}")
    print(f"No matches: {no_matches}")
    print(f"Total albums: {len(albums_2023)}")

    # Save updated CSV
    print("\nSaving updated file...")
    save_2023_data(file_2023, albums_2023)
    print(f"File saved: {file_2023}")

    # Display first few rows with new columns
    print("\nFirst 5 rows with new columns:")
    for album in albums_2023[:5]:
        print(f"{album['Rank']}: {album['Artist']} - {album['Album']} ({album['Status']})")
        if album['Info']:
            print(f"  Info: {album['Info']}")
        if album['Description']:
            desc = album['Description'][:100] + "..." if len(album['Description']) > 100 else album['Description']
            print(f"  Description: {desc}")
        print()


if __name__ == "__main__":
    main()