- Moved all Python processing scripts to scripts/ directory for better organization - Preserves git history using git mv command - Clean separation between main project files and utility scripts 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
209 lines
No EOL
7.7 KiB
Python
209 lines
No EOL
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to merge Info and Description columns from 2020 Rolling Stone data
|
|
into the combined 2023 top 500 albums CSV file.
|
|
"""
|
|
|
|
import csv
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
|
|
def normalize_string(s):
    """Normalize a string so that near-identical names compare equal.

    The value is converted with ``str()``, lowercased, stripped of every
    character that is not a word character, whitespace, or ``&``, and has
    runs of whitespace collapsed to a single space with no leading or
    trailing space.

    Args:
        s: Value to normalize; ``None`` and ``''`` are accepted.

    Returns:
        The normalized string, or ``""`` when ``s`` is ``None`` or empty.
    """
    if s is None or s == '':
        return ""
    # Lowercase so that comparisons are case-insensitive.
    s = str(s).lower()
    # Remove punctuation and special characters (``&`` is kept).
    s = re.sub(r'[^\w\s&]', '', s)
    # Collapse whitespace runs, then trim. Trimming has to come after the
    # punctuation pass: deleting a leading or trailing symbol ("- abc")
    # would otherwise leave a stray space at the edge of the result.
    s = re.sub(r'\s+', ' ', s).strip()
    return s
|
|
|
|
def similarity_ratio(a, b):
    """Return the difflib similarity of ``a`` and ``b`` as a percentage."""
    matcher = SequenceMatcher(None, a, b)
    # ratio() is a fraction; scale it so callers can use percent thresholds.
    fraction = matcher.ratio()
    return fraction * 100
|
|
|
|
def fuzzy_match_albums(artist1, album1, artist2, album2, threshold=85):
    """
    Decide whether two artist/album pairs describe the same record.

    Artist names and album titles are normalized and compared separately;
    the pair counts as a match only when both similarity percentages
    reach ``threshold``.

    Returns True for a match, False otherwise.
    """
    # Compare the artists first; the titles only matter if these agree.
    artist_score = similarity_ratio(
        normalize_string(artist1),
        normalize_string(artist2),
    )
    if artist_score >= threshold:
        album_score = similarity_ratio(
            normalize_string(album1),
            normalize_string(album2),
        )
        return album_score >= threshold
    return False
|
|
|
|
def _parse_2020_record(raw_row):
    """Parse one reassembled CSV record into a ``(key, record)`` pair.

    Returns ``None`` when the row cannot be parsed by the ``csv`` module or
    has fewer than five fields.
    """
    try:
        fields = next(csv.reader([raw_row]))
    except csv.Error:
        # Best-effort load: a malformed row is skipped rather than fatal.
        return None
    if len(fields) < 5:
        return None
    # The first five fields are rank, artist, album, info, description;
    # the rank is not stored.
    _rank, artist, album, info, description = fields[:5]
    key = normalize_string(f"{artist} {album}")
    record = {
        'artist': artist,
        'album': album,
        'info': info,
        'description': description,
    }
    return key, record


def load_2020_data(filepath):
    """Load 2020 data, handling descriptions that span several lines.

    The file is read line by line. A line whose first character is a digit
    is treated as the start of a new record (the rank column); any other
    line is appended, separated by a single space, to the record currently
    being assembled. Each assembled record is then parsed as one CSV row.

    NOTE: a continuation line that itself begins with a digit is
    indistinguishable from a new record under this heuristic.

    Args:
        filepath: Path to the UTF-8 encoded 2020 CSV file. The first line
            is assumed to be a header and is skipped.

    Returns:
        A dict mapping ``normalize_string("<artist> <album>")`` to a dict
        with the keys ``'artist'``, ``'album'``, ``'info'`` and
        ``'description'``. Rows that fail to parse or have fewer than five
        fields are skipped; when two rows share a key the later one wins.
    """
    # Each entry is the list of stripped physical lines forming one record.
    raw_records = []

    with open(filepath, 'r', encoding='utf-8') as file:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(file, None)

        for line in file:
            line = line.strip()
            if line and line[0].isdigit():
                # Leading digit: start of a new record.
                raw_records.append([line])
            elif raw_records:
                # Continuation of a multi-line description. Lines that
                # appear before the first record are ignored.
                raw_records[-1].append(line)

    albums_2020 = {}
    for parts in raw_records:
        parsed = _parse_2020_record(" ".join(parts))
        if parsed is not None:
            key, record = parsed
            albums_2020[key] = record

    return albums_2020
|
|
|
|
def load_2023_data(filepath):
    """Load the 2023 album list from a CSV file.

    Args:
        filepath: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    # newline='' leaves line-ending handling to the csv module, so line
    # breaks embedded in quoted fields are returned unchanged instead of
    # being rewritten by universal-newline translation.
    with open(filepath, 'r', newline='', encoding='utf-8') as file:
        return list(csv.DictReader(file))
|
|
|
|
def save_2023_data(filepath, albums_2023):
    """Write the album rows to ``filepath`` as CSV.

    The output has a header row followed by one row per album, using the
    columns Rank, Artist, Album, Status, Info, Description. When
    ``albums_2023`` is empty nothing is written and the file is not opened.
    """
    if albums_2023:
        columns = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']
        with open(filepath, 'w', newline='', encoding='utf-8') as out:
            csv_out = csv.DictWriter(out, fieldnames=columns)
            csv_out.writeheader()
            csv_out.writerows(albums_2023)
|
|
|
|
def _best_fuzzy_match(artist, album_title, albums_2020):
    """Return ``(record, score)`` for the closest fuzzy match in the 2020 data.

    Only records accepted by ``fuzzy_match_albums`` are candidates. Among
    them the one with the highest mean of artist and album similarity wins;
    on a tie the first one encountered is kept. Returns ``(None, 0)`` when
    no record qualifies.
    """
    # Loop-invariant: normalize the 2023 side once, not once per candidate.
    artist_norm = normalize_string(artist)
    title_norm = normalize_string(album_title)

    best_match = None
    best_score = 0
    for candidate in albums_2020.values():
        if not fuzzy_match_albums(artist, album_title,
                                  candidate['artist'], candidate['album']):
            continue

        # Combined score is used only to rank the accepted candidates.
        artist_score = similarity_ratio(
            artist_norm, normalize_string(candidate['artist']))
        album_score = similarity_ratio(
            title_norm, normalize_string(candidate['album']))
        combined_score = (artist_score + album_score) / 2

        if combined_score > best_score:
            best_score = combined_score
            best_match = candidate

    return best_match, best_score


def main(file_2023=None, file_2020=None):
    """Merge the 2020 Info/Description columns into the 2023 CSV.

    Every 2023 row gets ``Info`` and ``Description`` reset to ``''`` and is
    then matched against the 2020 data, first by normalized
    "artist album" key and, failing that, by fuzzy matching. Progress and a
    summary are printed, and the 2023 file is overwritten in place with the
    updated rows.

    Args:
        file_2023: Path of the 2023 CSV to read and overwrite. Defaults to
            the project's original hard-coded location.
        file_2020: Path of the 2020 CSV to read. Defaults to the project's
            original hard-coded location.
    """
    # File paths (defaults preserve the script's original behavior).
    if file_2023 is None:
        file_2023 = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'
    if file_2020 is None:
        file_2020 = '/home/lundberg/projects/top500albums/rolling_stone_top_500_albums_2020.csv'

    print("Loading 2023 data...")
    albums_2023 = load_2023_data(file_2023)
    print(f"Loaded {len(albums_2023)} albums from 2023 data")

    print("Loading 2020 data...")
    albums_2020 = load_2020_data(file_2020)
    print(f"Loaded {len(albums_2020)} albums from 2020 data")

    # Add new columns to 2023 data
    for album in albums_2023:
        album['Info'] = ''
        album['Description'] = ''

    matches_found = 0
    no_matches = 0

    print("Matching albums...")
    for album_2023 in albums_2023:
        artist_2023 = album_2023['Artist']
        album_title_2023 = album_2023['Album']

        # First try an exact match on the normalized "artist album" key.
        key_2023 = normalize_string(f"{artist_2023} {album_title_2023}")
        exact = albums_2020.get(key_2023)
        if exact is not None:
            album_2023['Info'] = exact['info']
            album_2023['Description'] = exact['description']
            matches_found += 1
            print(f"✓ Exact match: {artist_2023} - {album_title_2023}")
            continue

        # Otherwise fall back to fuzzy matching over all 2020 records.
        best_match, best_score = _best_fuzzy_match(
            artist_2023, album_title_2023, albums_2020)
        if best_match is not None:
            album_2023['Info'] = best_match['info']
            album_2023['Description'] = best_match['description']
            matches_found += 1
            print(f"✓ Fuzzy match ({best_score:.1f}%): {artist_2023} - {album_title_2023} → {best_match['artist']} - {best_match['album']}")
        else:
            no_matches += 1
            print(f"✗ No match: {artist_2023} - {album_title_2023}")

    print("\nMatching complete!")
    print(f"Matches found: {matches_found}")
    print(f"No matches: {no_matches}")
    print(f"Total albums: {len(albums_2023)}")

    # Save updated CSV (overwrites the 2023 input file).
    print("\nSaving updated file...")
    save_2023_data(file_2023, albums_2023)
    print(f"File saved: {file_2023}")

    # Display first few rows with new columns
    print("\nFirst 5 rows with new columns:")
    for album in albums_2023[:5]:
        print(f"{album['Rank']}: {album['Artist']} - {album['Album']} ({album['Status']})")
        if album['Info']:
            print(f" Info: {album['Info']}")
        if album['Description']:
            # Truncate long descriptions so the preview stays readable.
            desc = album['Description'][:100] + "..." if len(album['Description']) > 100 else album['Description']
            print(f" Description: {desc}")
        print()
|
|
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()