top500albums/scripts/merge_descriptions.py
Johan Lundberg 872fdfa0ee Organize repository by moving all scripts to scripts/ folder
- Moved all Python processing scripts to scripts/ directory for better organization
- Preserves git history using git mv command
- Clean separation between main project files and utility scripts

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-01 00:36:08 +02:00

209 lines
No EOL
7.7 KiB
Python

#!/usr/bin/env python3
"""
Script to merge Info and Description columns from 2020 Rolling Stone data
into the combined 2023 top 500 albums CSV file.
"""
import csv
from difflib import SequenceMatcher
import re
def normalize_string(s):
    """Normalize a string so that near-identical names compare equal.

    The value is converted with ``str()``, lowercased, stripped of every
    character that is not a word character, whitespace or ``&``, and has
    runs of whitespace collapsed to a single space with no leading or
    trailing space.

    Args:
        s: Value to normalize. ``None`` and the empty string are accepted.

    Returns:
        The normalized string, or ``""`` when *s* is ``None`` or empty.
    """
    if s is None or s == '':
        return ""
    # Lowercase so that matching is case-insensitive
    s = str(s).lower()
    # Remove common punctuation and special characters (but keep "&")
    s = re.sub(r'[^\w\s&]', '', s)
    # Collapse whitespace and trim *after* punctuation removal: trimming
    # only beforehand leaves a stray space when the text starts or ends
    # with punctuation that was separated by a space (e.g. "- Foo -").
    s = re.sub(r'\s+', ' ', s).strip()
    return s
def similarity_ratio(a, b):
    """Return the difflib similarity of *a* and *b* scaled to a 0-100 range."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio() * 100
def fuzzy_match_albums(artist1, album1, artist2, album2, threshold=85):
    """
    Decide whether two (artist, album) pairs refer to the same album.

    Artist names and album titles are normalized and compared separately;
    the pair counts as a match only when both similarity scores reach
    the threshold.
    Returns True on a match, False otherwise.
    """
    name_pairs = (
        (normalize_string(artist1), normalize_string(artist2)),
        (normalize_string(album1), normalize_string(album2)),
    )
    # One similarity score per pair: artist first, then album
    scores = [similarity_ratio(left, right) for left, right in name_pairs]
    # Every score has to clear the threshold
    return all(score >= threshold for score in scores)
def _store_2020_record(row_text, albums_2020):
    """Parse one reassembled CSV record and add it to *albums_2020* in place.

    Records that cannot be parsed, or that have fewer than five fields,
    are skipped silently.
    """
    try:
        fields = next(csv.reader([row_text]))
    except csv.Error:
        return  # Skip malformed rows
    if len(fields) < 5:
        return
    _rank, artist, album, info, description = fields[:5]
    key = normalize_string(f"{artist} {album}")
    albums_2020[key] = {
        'artist': artist,
        'album': album,
        'info': info,
        'description': description
    }


def load_2020_data(filepath):
    """Load 2020 data handling multi-line descriptions.

    The file is read line by line. A line whose first character is a digit
    is treated as the start of a new record (the rank column); every other
    line is appended, separated by a space, to the record currently being
    assembled. Note that a continuation line that happens to start with a
    digit is therefore also treated as a new record.

    Args:
        filepath: Path of the UTF-8 encoded 2020 CSV file. The first line
            is assumed to be a header and is skipped.

    Returns:
        dict mapping the normalized "artist album" string to a dict with
        the keys 'artist', 'album', 'info' and 'description'. An empty or
        header-only file yields an empty dict.
    """
    albums_2020 = {}
    with open(filepath, 'r', encoding='utf-8') as file:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(file, None)
        current_row = None
        for line in file:
            line = line.strip()
            # Check if this line starts with a rank number (new record)
            if line and line[0].isdigit():
                # Save previous record if exists
                if current_row:
                    _store_2020_record(current_row, albums_2020)
                # Start new record
                current_row = line
            elif current_row:
                # Continue multi-line description
                current_row += " " + line
        # Don't forget the last record
        if current_row:
            _store_2020_record(current_row, albums_2020)
    return albums_2020
def load_2023_data(filepath):
    """Read the 2023 CSV and return its rows as a list of dicts keyed by column header."""
    with open(filepath, 'r', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))
def save_2023_data(filepath, albums_2023):
    """Write the album rows to *filepath* as CSV; do nothing when there are no rows."""
    if albums_2023:
        columns = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']
        with open(filepath, 'w', newline='', encoding='utf-8') as out_file:
            csv_writer = csv.DictWriter(out_file, fieldnames=columns)
            csv_writer.writeheader()
            csv_writer.writerows(albums_2023)
def _find_best_fuzzy_match(artist, album_title, albums_2020):
    """Return ``(record, score)`` for the best fuzzy 2020 match, or ``(None, 0)``.

    A candidate qualifies only when ``fuzzy_match_albums`` accepts it;
    qualifying candidates are ranked by the mean of their artist and album
    similarity scores, and the first one with the highest mean wins.
    """
    # The 2023 side does not change between candidates, so normalize it once
    artist_norm = normalize_string(artist)
    title_norm = normalize_string(album_title)
    best_match = None
    best_score = 0
    for candidate in albums_2020.values():
        if not fuzzy_match_albums(artist, album_title,
                                  candidate['artist'], candidate['album']):
            continue
        # Calculate combined score for ranking
        artist_score = similarity_ratio(artist_norm,
                                        normalize_string(candidate['artist']))
        album_score = similarity_ratio(title_norm,
                                       normalize_string(candidate['album']))
        combined_score = (artist_score + album_score) / 2
        if combined_score > best_score:
            best_score = combined_score
            best_match = candidate
    return best_match, best_score


def main(file_2023='/home/lundberg/projects/top500albums/top_500_albums_2023.csv',
         file_2020='/home/lundberg/projects/top500albums/rolling_stone_top_500_albums_2020.csv'):
    """Merge the 2020 Info/Description columns into the 2023 CSV file.

    Every 2023 row gets 'Info' and 'Description' values copied from the
    2020 record with the same normalized "artist album" key or, failing
    that, from the best fuzzy match. Rows without a match keep empty
    strings. The 2023 file is then overwritten in place and a short
    summary is printed.

    Args:
        file_2023: Path of the 2023 CSV; read and then overwritten.
            Defaults to the path the script originally hard-coded.
        file_2020: Path of the 2020 Rolling Stone CSV (read only).
            Defaults to the path the script originally hard-coded.
    """
    print("Loading 2023 data...")
    albums_2023 = load_2023_data(file_2023)
    print(f"Loaded {len(albums_2023)} albums from 2023 data")
    print("Loading 2020 data...")
    albums_2020 = load_2020_data(file_2020)
    print(f"Loaded {len(albums_2020)} albums from 2020 data")
    # Add new columns to 2023 data
    for album in albums_2023:
        album['Info'] = ''
        album['Description'] = ''
    matches_found = 0
    no_matches = 0
    print("Matching albums...")
    for album_2023 in albums_2023:
        artist_2023 = album_2023['Artist']
        album_title_2023 = album_2023['Album']
        # First try exact match on the normalized "artist album" key
        exact = albums_2020.get(normalize_string(f"{artist_2023} {album_title_2023}"))
        if exact is not None:
            album_2023['Info'] = exact['info']
            album_2023['Description'] = exact['description']
            matches_found += 1
            print(f"✓ Exact match: {artist_2023} - {album_title_2023}")
            continue
        # Try fuzzy matching
        best_match, best_score = _find_best_fuzzy_match(
            artist_2023, album_title_2023, albums_2020)
        if best_match:
            album_2023['Info'] = best_match['info']
            album_2023['Description'] = best_match['description']
            matches_found += 1
            # Separator keeps the 2023 title from running into the 2020 artist
            print(f"✓ Fuzzy match ({best_score:.1f}%): {artist_2023} - {album_title_2023} <-> {best_match['artist']} - {best_match['album']}")
        else:
            no_matches += 1
            print(f"✗ No match: {artist_2023} - {album_title_2023}")
    print("\nMatching complete!")
    print(f"Matches found: {matches_found}")
    print(f"No matches: {no_matches}")
    print(f"Total albums: {len(albums_2023)}")
    # Save updated CSV
    print("\nSaving updated file...")
    save_2023_data(file_2023, albums_2023)
    print(f"File saved: {file_2023}")
    # Display first few rows with new columns
    print("\nFirst 5 rows with new columns:")
    for album in albums_2023[:5]:
        print(f"{album['Rank']}: {album['Artist']} - {album['Album']} ({album['Status']})")
        if album['Info']:
            print(f" Info: {album['Info']}")
        if album['Description']:
            # Truncate long descriptions to keep the preview readable
            desc = album['Description'][:100] + "..." if len(album['Description']) > 100 else album['Description']
            print(f" Description: {desc}")
        print()
# Run the merge only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()