top500albums/scripts/merge_descriptions.py

#!/usr/bin/env python3
"""
Script to merge Info and Description columns from 2020 Rolling Stone data
into the combined 2023 top 500 albums CSV file.
"""

import csv
from difflib import SequenceMatcher
import re

def normalize_string(s):
    """Normalize a string for better matching.

    The text is lowercased, every character that is not a word character,
    whitespace or ``&`` is removed, whitespace runs are collapsed to a single
    space, and leading/trailing whitespace is dropped.

    Args:
        s: Value to normalize. Non-string values are converted with ``str()``.

    Returns:
        The normalized text, or ``""`` when ``s`` is ``None`` or empty.
    """
    if s is None or s == '':
        return ""
    text = str(s).lower()
    # Remove common punctuation and special characters
    text = re.sub(r'[^\w\s&]', '', text)
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Strip last: removing punctuation can expose whitespace at the edges
    # (e.g. "Hello !" -> "hello "), which would otherwise survive and make
    # two equal titles produce different lookup keys.
    return text.strip()

def similarity_ratio(a, b):
    """Return the difflib similarity of ``a`` and ``b`` as a percentage (0-100)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio() * 100

def fuzzy_match_albums(artist1, album1, artist2, album2, threshold=85):
    """
    Decide whether two (artist, album) pairs refer to the same album.

    Each pair of values is normalized and compared with ``similarity_ratio``;
    the result is True only when the artist score and the album score both
    reach ``threshold`` (a percentage).
    """
    compared_pairs = ((artist1, artist2), (album1, album2))
    return all(
        similarity_ratio(normalize_string(left), normalize_string(right)) >= threshold
        for left, right in compared_pairs
    )

def _store_2020_record(albums_2020, raw_row):
    """Parse one reassembled CSV record and add it to ``albums_2020`` in place.

    Rows that cannot be parsed or that have fewer than five fields are skipped.
    """
    try:
        fields = next(csv.reader([raw_row]))
    except (csv.Error, StopIteration):
        return  # Skip malformed rows
    if len(fields) < 5:
        return

    _rank, artist, album, info, description = fields[:5]
    albums_2020[normalize_string(f"{artist} {album}")] = {
        'artist': artist,
        'album': album,
        'info': info,
        'description': description
    }


def load_2020_data(filepath):
    """Load 2020 data handling multi-line descriptions.

    The first line of the file is treated as a header and skipped. Every
    following line whose first character is a digit starts a new record; any
    other line is appended (space-separated) to the record in progress.

    NOTE: a continuation line that itself begins with a digit is therefore
    interpreted as the start of a new record.

    Args:
        filepath: Path of the UTF-8 encoded 2020 CSV file.

    Returns:
        Dict mapping ``normalize_string(f"{artist} {album}")`` to a dict with
        the keys ``artist``, ``album``, ``info`` and ``description``. When two
        records share a key, the later one wins.
    """
    albums_2020 = {}

    with open(filepath, 'r', encoding='utf-8') as file:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(file, None)

        current_row = None
        for line in file:
            line = line.strip()

            # Check if this line starts with a rank number (new record)
            if line and line[0].isdigit():
                # Save previous record if exists
                if current_row:
                    _store_2020_record(albums_2020, current_row)
                # Start new record
                current_row = line
            elif current_row:
                # Continue multi-line description
                current_row += " " + line

        # Don't forget the last record
        if current_row:
            _store_2020_record(albums_2020, current_row)

    return albums_2020

def load_2023_data(filepath):
    """Read the 2023 CSV and return its rows as a list of dicts keyed by header."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        return list(csv.DictReader(handle))

def save_2023_data(filepath, albums_2023):
    """Write the album rows to ``filepath`` as CSV; do nothing when there are no rows.

    The output always has the columns Rank, Artist, Album, Status, Info and
    Description, in that order.
    """
    if albums_2023:
        columns = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']
        with open(filepath, 'w', newline='', encoding='utf-8') as out:
            writer = csv.DictWriter(out, fieldnames=columns)
            writer.writeheader()
            writer.writerows(albums_2023)

def _best_fuzzy_match(artist, album_title, albums_2020):
    """Return ``(record, score)`` for the best fuzzy 2020 match, or ``(None, 0)``."""
    # Normalize the 2023 side once instead of once per 2020 candidate.
    artist_norm = normalize_string(artist)
    album_norm = normalize_string(album_title)

    best_match = None
    best_score = 0
    for candidate in albums_2020.values():
        if not fuzzy_match_albums(artist, album_title,
                                  candidate['artist'], candidate['album']):
            continue
        # Calculate combined score for ranking
        artist_score = similarity_ratio(artist_norm,
                                        normalize_string(candidate['artist']))
        album_score = similarity_ratio(album_norm,
                                       normalize_string(candidate['album']))
        combined_score = (artist_score + album_score) / 2

        if combined_score > best_score:
            best_score = combined_score
            best_match = candidate
    return best_match, best_score


def main(file_2023=None, file_2020=None):
    """Merge the 2020 Info/Description columns into the 2023 CSV file.

    Loads both files, resets ``Info`` and ``Description`` on every 2023 row,
    fills them from the 2020 record with the same normalized "artist album"
    key or, failing that, from the best fuzzy match, then overwrites the 2023
    file in place and prints a summary plus the first five rows.

    Args:
        file_2023: Path of the 2023 CSV, which is read and then overwritten.
            Defaults to the original hard-coded location.
        file_2020: Path of the 2020 Rolling Stone CSV. Defaults to the
            original hard-coded location.
    """
    # File paths
    if file_2023 is None:
        file_2023 = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'
    if file_2020 is None:
        file_2020 = '/home/lundberg/projects/top500albums/rolling_stone_top_500_albums_2020.csv'

    print("Loading 2023 data...")
    albums_2023 = load_2023_data(file_2023)
    print(f"Loaded {len(albums_2023)} albums from 2023 data")

    print("Loading 2020 data...")
    albums_2020 = load_2020_data(file_2020)
    print(f"Loaded {len(albums_2020)} albums from 2020 data")

    # Add new columns to 2023 data
    for album in albums_2023:
        album['Info'] = ''
        album['Description'] = ''

    matches_found = 0
    no_matches = 0

    print("Matching albums...")
    for album_2023 in albums_2023:
        artist_2023 = album_2023['Artist']
        album_title_2023 = album_2023['Album']

        # First try exact match
        exact = albums_2020.get(normalize_string(f"{artist_2023} {album_title_2023}"))
        if exact is not None:
            album_2023['Info'] = exact['info']
            album_2023['Description'] = exact['description']
            matches_found += 1
            print(f"✓ Exact match: {artist_2023} - {album_title_2023}")
            continue

        # Try fuzzy matching
        best_match, best_score = _best_fuzzy_match(
            artist_2023, album_title_2023, albums_2020)
        if best_match is not None:
            album_2023['Info'] = best_match['info']
            album_2023['Description'] = best_match['description']
            matches_found += 1
            print(f"✓ Fuzzy match ({best_score:.1f}%): {artist_2023} - {album_title_2023} → {best_match['artist']} - {best_match['album']}")
        else:
            no_matches += 1
            print(f"✗ No match: {artist_2023} - {album_title_2023}")

    print("\nMatching complete!")
    print(f"Matches found: {matches_found}")
    print(f"No matches: {no_matches}")
    print(f"Total albums: {len(albums_2023)}")

    # Save updated CSV
    print("\nSaving updated file...")
    save_2023_data(file_2023, albums_2023)
    print(f"File saved: {file_2023}")

    # Display first few rows with new columns
    print("\nFirst 5 rows with new columns:")
    for album in albums_2023[:5]:
        print(f"{album['Rank']}: {album['Artist']} - {album['Album']} ({album['Status']})")
        if album['Info']:
            print(f"   Info: {album['Info']}")
        description = album['Description']
        if description:
            # Keep the preview short: show at most the first 100 characters.
            if len(description) > 100:
                description = description[:100] + "..."
            print(f"   Description: {description}")
        print()

if __name__ == "__main__":
    main()