Organize repository by moving all scripts to scripts/ folder
- Moved all Python processing scripts to scripts/ directory for better organization
- Preserves git history using git mv command
- Clean separation between main project files and utility scripts

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
parent 462fdcfa84
commit 872fdfa0ee
15 changed files with 0 additions and 0 deletions
209
scripts/merge_descriptions.py
Normal file
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Script to merge Info and Description columns from 2020 Rolling Stone data
into the combined 2023 top 500 albums CSV file.
"""

import csv
from difflib import SequenceMatcher
import re

def normalize_string(s):
    """Normalize string for better matching"""
    if s is None or s == '':
        return ""
    # Convert to lowercase and remove extra whitespace
    s = str(s).lower().strip()
    # Remove common punctuation and special characters
    s = re.sub(r'[^\w\s&]', '', s)
    # Replace multiple spaces with single space
    s = re.sub(r'\s+', ' ', s)
    return s

def similarity_ratio(a, b):
    """Calculate similarity ratio between two strings using difflib"""
    return SequenceMatcher(None, a, b).ratio() * 100

def fuzzy_match_albums(artist1, album1, artist2, album2, threshold=85):
    """
    Use fuzzy matching to determine if two albums are the same
    Returns True if they match above the threshold
    """
    # Normalize strings
    artist1_norm = normalize_string(artist1)
    album1_norm = normalize_string(album1)
    artist2_norm = normalize_string(artist2)
    album2_norm = normalize_string(album2)

    # Calculate similarity scores
    artist_score = similarity_ratio(artist1_norm, artist2_norm)
    album_score = similarity_ratio(album1_norm, album2_norm)

    # Both artist and album must be above threshold
    return artist_score >= threshold and album_score >= threshold

def load_2020_data(filepath):
    """Load 2020 data handling multi-line descriptions"""
    albums_2020 = {}

    with open(filepath, 'r', encoding='utf-8') as file:
        # Skip the header
        next(file)

        current_row = None
        for line in file:
            line = line.strip()

            # Check if this line starts with a rank number (new record)
            if line and line[0].isdigit():
                # Save previous record if exists
                if current_row:
                    try:
                        # Parse the CSV fields
                        reader = csv.reader([current_row])
                        fields = next(reader)
                        if len(fields) >= 5:
                            rank, artist, album, info, description = fields[:5]
                            key = normalize_string(f"{artist} {album}")
                            albums_2020[key] = {
                                'artist': artist,
                                'album': album,
                                'info': info,
                                'description': description
                            }
                    except:
                        pass  # Skip malformed rows

                # Start new record
                current_row = line
            else:
                # Continue multi-line description
                if current_row:
                    current_row += " " + line

        # Don't forget the last record
        if current_row:
            try:
                reader = csv.reader([current_row])
                fields = next(reader)
                if len(fields) >= 5:
                    rank, artist, album, info, description = fields[:5]
                    key = normalize_string(f"{artist} {album}")
                    albums_2020[key] = {
                        'artist': artist,
                        'album': album,
                        'info': info,
                        'description': description
                    }
            except:
                pass

    return albums_2020

def load_2023_data(filepath):
    """Load 2023 data from CSV"""
    albums_2023 = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            albums_2023.append(row)
    return albums_2023

def save_2023_data(filepath, albums_2023):
    """Save updated 2023 data to CSV"""
    if not albums_2023:
        return

    fieldnames = ['Rank', 'Artist', 'Album', 'Status', 'Info', 'Description']
    with open(filepath, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for album in albums_2023:
            writer.writerow(album)

def main():
    # File paths
    file_2023 = '/home/lundberg/projects/top500albums/top_500_albums_2023.csv'
    file_2020 = '/home/lundberg/projects/top500albums/rolling_stone_top_500_albums_2020.csv'

    print("Loading 2023 data...")
    albums_2023 = load_2023_data(file_2023)
    print(f"Loaded {len(albums_2023)} albums from 2023 data")

    print("Loading 2020 data...")
    albums_2020 = load_2020_data(file_2020)
    print(f"Loaded {len(albums_2020)} albums from 2020 data")

    # Add new columns to 2023 data
    for album in albums_2023:
        album['Info'] = ''
        album['Description'] = ''

    matches_found = 0
    no_matches = 0

    print("Matching albums...")
    for album_2023 in albums_2023:
        artist_2023 = album_2023['Artist']
        album_title_2023 = album_2023['Album']

        # First try exact match
        key_2023 = normalize_string(f"{artist_2023} {album_title_2023}")

        if key_2023 in albums_2020:
            # Exact match found
            album_2023['Info'] = albums_2020[key_2023]['info']
            album_2023['Description'] = albums_2020[key_2023]['description']
            matches_found += 1
            print(f"✓ Exact match: {artist_2023} - {album_title_2023}")
        else:
            # Try fuzzy matching
            best_match = None
            best_score = 0

            for key_2020, album_2020 in albums_2020.items():
                if fuzzy_match_albums(artist_2023, album_title_2023,
                                      album_2020['artist'], album_2020['album']):
                    # Calculate combined score for ranking
                    artist_score = similarity_ratio(normalize_string(artist_2023),
                                                    normalize_string(album_2020['artist']))
                    album_score = similarity_ratio(normalize_string(album_title_2023),
                                                   normalize_string(album_2020['album']))
                    combined_score = (artist_score + album_score) / 2

                    if combined_score > best_score:
                        best_score = combined_score
                        best_match = album_2020

            if best_match:
                album_2023['Info'] = best_match['info']
                album_2023['Description'] = best_match['description']
                matches_found += 1
                print(f"✓ Fuzzy match ({best_score:.1f}%): {artist_2023} - {album_title_2023} → {best_match['artist']} - {best_match['album']}")
            else:
                no_matches += 1
                print(f"✗ No match: {artist_2023} - {album_title_2023}")

    print(f"\nMatching complete!")
    print(f"Matches found: {matches_found}")
    print(f"No matches: {no_matches}")
    print(f"Total albums: {len(albums_2023)}")

    # Save updated CSV
    print(f"\nSaving updated file...")
    save_2023_data(file_2023, albums_2023)
    print(f"File saved: {file_2023}")

    # Display first few rows with new columns
    print("\nFirst 5 rows with new columns:")
    for i, album in enumerate(albums_2023[:5]):
        print(f"{album['Rank']}: {album['Artist']} - {album['Album']} ({album['Status']})")
        if album['Info']:
            print(f"  Info: {album['Info']}")
        if album['Description']:
            desc = album['Description'][:100] + "..." if len(album['Description']) > 100 else album['Description']
            print(f"  Description: {desc}")
        print()

if __name__ == "__main__":
    main()
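For reference, a minimal standalone sketch of the matching heuristic the script uses: normalize both strings, score them with difflib, and require both the artist and the album comparison to clear the 85% threshold. The album titles below are hypothetical examples, and the helper names normalize/score are trimmed-down stand-ins for the script's normalize_string/similarity_ratio rather than imports from it.

#!/usr/bin/env python3
import re
from difflib import SequenceMatcher

def normalize(s):
    # Lowercase, strip punctuation except '&', collapse whitespace (mirrors normalize_string)
    s = str(s or '').lower().strip()
    s = re.sub(r'[^\w\s&]', '', s)
    return re.sub(r'\s+', ' ', s)

def score(a, b):
    # Percentage similarity of the normalized strings (mirrors similarity_ratio)
    return SequenceMatcher(None, normalize(a), normalize(b)).ratio() * 100

# Hypothetical inputs: identical artist, album differing in punctuation and a trailing word
artist_score = score("The Beatles", "The Beatles")
album_score = score("Sgt. Pepper's Lonely Hearts Club Band",
                    "Sgt Peppers Lonely Hearts Club")

print(f"artist: {artist_score:.1f}%, album: {album_score:.1f}%")
print("match" if artist_score >= 85 and album_score >= 85 else "no match")

Requiring both fields to clear the threshold keeps false positives down (for example, two different albums by the same artist), though it can miss reordered names such as "Beatles, The" versus "The Beatles", which score well below 85% under this character-level comparison.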