Organize repository by moving all scripts to scripts/ folder
- Moved all Python processing scripts to scripts/ directory for better organization
- Preserves git history using git mv command
- Clean separation between main project files and utility scripts

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
parent 462fdcfa84
commit 872fdfa0ee
15 changed files with 0 additions and 0 deletions
209
scripts/merge_descriptions.py
Normal file
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Script to merge Info and Description columns from 2020 Rolling Stone data
into the combined 2023 top 500 albums CSV file.
"""

import csv
from difflib import SequenceMatcher
import re

def normalize_string(s):
    """Normalize string for better matching"""
    if s is None or s == '':
        return ""
    # Convert to lowercase and remove extra whitespace
    s = str(s).lower().strip()
    # Remove common punctuation and special characters
    s = re.sub(r'[^\w\s&]', '', s)
    # Replace multiple spaces with single space
    s = re.sub(r'\s+', ' ', s)
    return s

def similarity_ratio(a, b):
    """Calculate similarity ratio between two strings using difflib"""
    return SequenceMatcher(None, a, b).ratio() * 100

def fuzzy_match_albums(artist1, album1, artist2, album2, threshold=85):
    """
    Use fuzzy matching to determine if two albums are the same
    Returns True if they match above the threshold
    """
    # Normalize strings
    artist1_norm = normalize_string(artist1)
    album1_norm = normalize_string(album1)
    artist2_norm = normalize_string(artist2)
    album2_norm = normalize_string(album2)

    # Calculate similarity scores
    artist_score = similarity_ratio(artist1_norm, artist2_norm)
    album_score = similarity_ratio(album1_norm, album2_norm)

    # Both artist and album must be above threshold
    return artist_score >= threshold and album_score >= threshold

def load_2020_data(filepath):
    """Load 2020 data handling multi-line descriptions"""
    albums_2020 = {}

    with open(filepath, 'r', encoding='utf-8') as file:
        # Skip the header
        next(file)

        current_row = None
        for line in file:
            line = line.strip()

            # Check if this line starts with a rank number (new record)
            if line and line[0].isdigit():
                # Save previous record if exists
                if current_row:
                    try:
                        # Parse the CSV fields
                        reader = csv.reader([current_row])
                        fields = next(reader)
                        if len(fields) >= 5:
                            rank, artist, album, info, description = fields[:5]
                            key = normalize_string(f"{artist} {album}")
                            albums_2020[key] = {
                                'artist': artist,
                                'album': album,
                                'info': info,
                                'description': description
                            }
                    except:
                        pass  # Skip malformed rows

                # Start new record
                current_row = line
            else:
                # Continue multi-line description
                if current_row:
                    current_row += " " + line

        # Don't forget the last record
        if current_row:
            try:
                reader = csv.reader([current_row])
                fields = next(reader)
                if len(fields) >= 5:
                    rank, artist, album, info, description = fields[:5]
                    key = normalize_string(f"{artist} {album}")
                    albums_2020[key] = {
                        'artist': artist,
                        'album': album,
                        'info': info,
                        'description': description
                    }
            except:
                pass

    return albums_2020

def load_2023_data(filepath):
    """Load 2023 data from CSV"""
    albums_2023 = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            albums_2023.append(row)
    return albums_2023

def save_2023_data(filepath, albums_2023):
    """Save updated 2023 data to CSV"""
    if not albums_2023:
        return

    fieldnames = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']
    with open(filepath, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for album in albums_2023:
            writer.writerow(album)

def main():
    # File paths
    file_2023 = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'
    file_2020 = '/home/lundberg/projects/top500albums/rolling_stone_top_500_albums_2020.csv'

    print("Loading 2023 data...")
    albums_2023 = load_2023_data(file_2023)
    print(f"Loaded {len(albums_2023)} albums from 2023 data")

    print("Loading 2020 data...")
    albums_2020 = load_2020_data(file_2020)
    print(f"Loaded {len(albums_2020)} albums from 2020 data")

    # Add new columns to 2023 data
    for album in albums_2023:
        album['Info'] = ''
        album['Description'] = ''

    matches_found = 0
    no_matches = 0

    print("Matching albums...")
    for album_2023 in albums_2023:
        artist_2023 = album_2023['Artist']
        album_title_2023 = album_2023['Album']

        # First try exact match
        key_2023 = normalize_string(f"{artist_2023} {album_title_2023}")

        if key_2023 in albums_2020:
            # Exact match found
            album_2023['Info'] = albums_2020[key_2023]['info']
            album_2023['Description'] = albums_2020[key_2023]['description']
            matches_found += 1
            print(f"✓ Exact match: {artist_2023} - {album_title_2023}")
        else:
            # Try fuzzy matching
            best_match = None
            best_score = 0

            for key_2020, album_2020 in albums_2020.items():
                if fuzzy_match_albums(artist_2023, album_title_2023,
                                      album_2020['artist'], album_2020['album']):
                    # Calculate combined score for ranking
                    artist_score = similarity_ratio(normalize_string(artist_2023),
                                                    normalize_string(album_2020['artist']))
                    album_score = similarity_ratio(normalize_string(album_title_2023),
                                                   normalize_string(album_2020['album']))
                    combined_score = (artist_score + album_score) / 2

                    if combined_score > best_score:
                        best_score = combined_score
                        best_match = album_2020

            if best_match:
                album_2023['Info'] = best_match['info']
                album_2023['Description'] = best_match['description']
                matches_found += 1
                print(f"✓ Fuzzy match ({best_score:.1f}%): {artist_2023} - {album_title_2023} → {best_match['artist']} - {best_match['album']}")
            else:
                no_matches += 1
                print(f"✗ No match: {artist_2023} - {album_title_2023}")

    print(f"\nMatching complete!")
    print(f"Matches found: {matches_found}")
    print(f"No matches: {no_matches}")
    print(f"Total albums: {len(albums_2023)}")

    # Save updated CSV
    print(f"\nSaving updated file...")
    save_2023_data(file_2023, albums_2023)
    print(f"File saved: {file_2023}")

    # Display first few rows with new columns
    print("\nFirst 5 rows with new columns:")
    for i, album in enumerate(albums_2023[:5]):
        print(f"{album['Rank']}: {album['Artist']} - {album['Album']} ({album['Status']})")
        if album['Info']:
            print(f"  Info: {album['Info']}")
        if album['Description']:
            desc = album['Description'][:100] + "..." if len(album['Description']) > 100 else album['Description']
            print(f"  Description: {desc}")
        print()

if __name__ == "__main__":
    main()
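For reference, a minimal standalone sketch of the matching heuristic the script uses: normalize both strings, score them with difflib, and require both the artist and the album comparison to clear the 85% threshold. The album titles below are hypothetical examples, and the helper names normalize/score are trimmed-down stand-ins for the script's normalize_string/similarity_ratio rather than imports from it.

#!/usr/bin/env python3
import re
from difflib import SequenceMatcher

def normalize(s):
    # Lowercase, strip punctuation except '&', collapse whitespace (mirrors normalize_string)
    s = str(s or '').lower().strip()
    s = re.sub(r'[^\w\s&]', '', s)
    return re.sub(r'\s+', ' ', s)

def score(a, b):
    # Percentage similarity of the normalized strings (mirrors similarity_ratio)
    return SequenceMatcher(None, normalize(a), normalize(b)).ratio() * 100

# Hypothetical inputs: identical artist, album differing in punctuation and a trailing word
artist_score = score("The Beatles", "The Beatles")
album_score = score("Sgt. Pepper's Lonely Hearts Club Band",
                    "Sgt Peppers Lonely Hearts Club")

print(f"artist: {artist_score:.1f}%, album: {album_score:.1f}%")
print("match" if artist_score >= 85 and album_score >= 85 else "no match")

Requiring both fields to clear the threshold keeps false positives down (for example, two different albums by the same artist), though it can miss reordered names such as "Beatles, The" versus "The Beatles", which score well below 85% under this character-level comparison.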