top500albums/extract_wikipedia_urls.py

#!/usr/bin/env python3
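"""Extract Wikipedia article URLs for the top 500 albums list.

Parses the tables on the Wikipedia:WikiProject_Albums/500 page, matches each
linked album to top_500_albums_2023.csv by rank, and writes the resulting
mapping to wikipedia_urls_mapping.json and wikipedia_urls.js.
"""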
import csv
import json
import urllib.request
from html.parser import HTMLParser

class WikipediaLinksExtractor(HTMLParser):
    """Collect rows from every 'wikitable', keeping the /wiki/ link found in the Album column."""

    def __init__(self):
        super().__init__()
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        self.current_row = []
        self.current_cell_data = []
        self.current_link = None
        self.data = []        # completed rows collected from all wikitables
        self.cell_count = 0   # 1-based index of the current cell within its row

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            classes = dict(attrs).get('class', '')
            if 'wikitable' in classes:
                self.in_table = True
        elif self.in_table and tag == 'tr':
            self.in_row = True
            self.current_row = []
            self.cell_count = 0
        elif self.in_row and tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell_data = []
            self.current_link = None
            self.cell_count += 1
        elif self.in_cell and tag == 'a':
            href = dict(attrs).get('href', '')
            if href.startswith('/wiki/') and self.cell_count == 2:  # Album column
                self.current_link = href

    def handle_data(self, data):
        if self.in_cell:
            self.current_cell_data.append(data.strip())

    def handle_endtag(self, tag):
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr' and self.in_row:
            self.in_row = False
            if len(self.current_row) >= 3:  # Ensure we have rank, album, artist
                self.data.append(self.current_row)
        elif tag in ['td', 'th'] and self.in_cell:
            self.in_cell = False
            cell_text = ' '.join(self.current_cell_data).strip()
            if self.cell_count == 2 and self.current_link:  # Album column with link
                self.current_row.append({
                    'text': cell_text,
                    'link': self.current_link
                })
            else:
                self.current_row.append(cell_text)

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Albums/500"
with urllib.request.urlopen(url) as response:
    html_content = response.read().decode('utf-8')

# Parse the HTML
parser = WikipediaLinksExtractor()
parser.feed(html_content)
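
# parser.data now holds one list per table row; the Album cell is a dict with
# 'text' and 'link' keys when it carried a /wiki/ link, otherwise a plain string.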

# Read our CSV file to match album names
albums_mapping = {}
with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        rank = row['Rank']
        album = row['Album'].strip()
        artist = row['Artist'].strip()
        albums_mapping[rank] = {'album': album, 'artist': artist}

# Create the mapping
wikipedia_urls = {}
for row in parser.data[1:]:  # Skip header
    if len(row) >= 3 and isinstance(row[0], str) and row[0].isdigit():
        rank = row[0]
        if isinstance(row[1], dict):  # Has link
            album_text = row[1]['text'].strip()
            wiki_path = row[1]['link']
            # Clean up the wiki path
            wiki_path = wiki_path.replace('/wiki/', '')
            # Match with our CSV data
            if rank in albums_mapping:
                csv_album = albums_mapping[rank]['album']
                csv_artist = albums_mapping[rank]['artist']
                wikipedia_urls[csv_album] = wiki_path

# Write the mapping to a JSON file
with open('wikipedia_urls_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(wikipedia_urls, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(wikipedia_urls)} Wikipedia URLs")

# Also create a JavaScript file with the mapping
js_content = f"""// Auto-generated Wikipedia URL mappings
const wikipediaUrlMappings = {json.dumps(wikipedia_urls, indent=2, ensure_ascii=False)};
export default wikipediaUrlMappings;
"""

with open('wikipedia_urls.js', 'w', encoding='utf-8') as f:
    f.write(js_content)

print("Created wikipedia_urls.js file")