Remove temporary CSV files
- Remove intermediate analysis files used during data processing
- Keep only essential files: top_500_albums_2023.csv, rolling_stone_top_500_albums_2020.csv, wikipedia_top_500_albums.csv
- Add Wikipedia URL extraction script and mapping files

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
parent fce2c6ff1e
commit 5279d3bbba
9 changed files with 614 additions and 1233 deletions
113  extract_wikipedia_urls.py  Normal file
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
import urllib.request
import re
import json
import csv
from html.parser import HTMLParser

class WikipediaLinksExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        self.current_row = []
        self.current_cell_data = []
        self.current_link = None
        self.data = []
        self.cell_count = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            classes = dict(attrs).get('class', '')
            if 'wikitable' in classes:
                self.in_table = True
        elif self.in_table and tag == 'tr':
            self.in_row = True
            self.current_row = []
            self.cell_count = 0
        elif self.in_row and tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell_data = []
            self.current_link = None
            self.cell_count += 1
        elif self.in_cell and tag == 'a':
            href = dict(attrs).get('href', '')
            if href.startswith('/wiki/') and self.cell_count == 2:  # Album column
                self.current_link = href

    def handle_data(self, data):
        if self.in_cell:
            self.current_cell_data.append(data.strip())

    def handle_endtag(self, tag):
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr' and self.in_row:
            self.in_row = False
            if len(self.current_row) >= 3:  # Ensure we have rank, album, artist
                self.data.append(self.current_row)
        elif tag in ['td', 'th'] and self.in_cell:
            self.in_cell = False
            cell_text = ' '.join(self.current_cell_data).strip()
            if self.cell_count == 2 and self.current_link:  # Album column with link
                self.current_row.append({
                    'text': cell_text,
                    'link': self.current_link
                })
            else:
                self.current_row.append(cell_text)

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Albums/500"
response = urllib.request.urlopen(url)
html_content = response.read().decode('utf-8')

# Parse the HTML
parser = WikipediaLinksExtractor()
parser.feed(html_content)

# Read our CSV file to match album names
albums_mapping = {}
with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        rank = row['Rank']
        album = row['Album'].strip()
        artist = row['Artist'].strip()
        albums_mapping[rank] = {'album': album, 'artist': artist}

# Create the mapping
wikipedia_urls = {}
for row in parser.data[1:]:  # Skip header
    if len(row) >= 3 and isinstance(row[0], str) and row[0].isdigit():
        rank = row[0]
        if isinstance(row[1], dict):  # Has link
            album_text = row[1]['text'].strip()
            wiki_path = row[1]['link']
            # Clean up the wiki path
            wiki_path = wiki_path.replace('/wiki/', '')

            # Match with our CSV data
            if rank in albums_mapping:
                csv_album = albums_mapping[rank]['album']
                csv_artist = albums_mapping[rank]['artist']
                wikipedia_urls[csv_album] = wiki_path

# Write the mapping to a JSON file
with open('wikipedia_urls_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(wikipedia_urls, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(wikipedia_urls)} Wikipedia URLs")

# Also create a JavaScript file with the mapping
js_content = f"""// Auto-generated Wikipedia URL mappings
const wikipediaUrlMappings = {json.dumps(wikipedia_urls, indent=2, ensure_ascii=False)};

export default wikipediaUrlMappings;
"""

with open('wikipedia_urls.js', 'w', encoding='utf-8') as f:
    f.write(js_content)

print("Created wikipedia_urls.js file")
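For context, here is a minimal sketch of how the generated wikipedia_urls_mapping.json might be consumed downstream. It is illustrative only and not part of this commit; it assumes extract_wikipedia_urls.py has already been run in the same directory, and the URL reconstruction simply reverses the '/wiki/' stripping done above.

#!/usr/bin/env python3
# Illustrative sketch (not part of this commit): consume the generated mapping.
import json

# The extractor writes album title -> Wikipedia path (with the '/wiki/' prefix stripped).
with open('wikipedia_urls_mapping.json', 'r', encoding='utf-8') as f:
    mapping = json.load(f)

# Rebuild full article URLs for a few entries.
for album, path in list(mapping.items())[:5]:
    print(f"{album}: https://en.wikipedia.org/wiki/{path}")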