- Remove intermediate analysis files used during data processing
- Keep only essential files: top_500_albums_2023.csv, rolling_stone_top_500_albums_2020.csv, wikipedia_top_500_albums.csv
- Add Wikipedia URL extraction script and mapping files

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
113 lines | No EOL | 3.9 KiB | Python
#!/usr/bin/env python3
"""Extract Wikipedia article links for the top-500 albums table and write JSON/JS mapping files."""

import urllib.request
import re
import json
import csv
from html.parser import HTMLParser


class WikipediaLinksExtractor(HTMLParser):
    """Collect rank/album/artist rows from the page's wikitable, keeping the album cell's link."""

    def __init__(self):
        super().__init__()
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        self.current_row = []
        self.current_cell_data = []
        self.current_link = None
        self.data = []
        self.cell_count = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            classes = dict(attrs).get('class', '')
            if 'wikitable' in classes:
                self.in_table = True
        elif self.in_table and tag == 'tr':
            self.in_row = True
            self.current_row = []
            self.cell_count = 0
        elif self.in_row and tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell_data = []
            self.current_link = None
            self.cell_count += 1
        elif self.in_cell and tag == 'a':
            href = dict(attrs).get('href', '')
            if href.startswith('/wiki/') and self.cell_count == 2:  # Album column
                self.current_link = href

    def handle_data(self, data):
        if self.in_cell:
            self.current_cell_data.append(data.strip())

    def handle_endtag(self, tag):
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr' and self.in_row:
            self.in_row = False
            if len(self.current_row) >= 3:  # Ensure we have rank, album, artist
                self.data.append(self.current_row)
        elif tag in ['td', 'th'] and self.in_cell:
            self.in_cell = False
            cell_text = ' '.join(self.current_cell_data).strip()
            if self.cell_count == 2 and self.current_link:  # Album column with link
                self.current_row.append({
                    'text': cell_text,
                    'link': self.current_link
                })
            else:
                self.current_row.append(cell_text)


# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Albums/500"
response = urllib.request.urlopen(url)
html_content = response.read().decode('utf-8')

# Parse the HTML
parser = WikipediaLinksExtractor()
parser.feed(html_content)

# Read our CSV file to match album names
albums_mapping = {}
with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        rank = row['Rank']
        album = row['Album'].strip()
        artist = row['Artist'].strip()
        albums_mapping[rank] = {'album': album, 'artist': artist}

# Create the mapping from CSV album title to Wikipedia article path
wikipedia_urls = {}
for row in parser.data[1:]:  # Skip header
    if len(row) >= 3 and isinstance(row[0], str) and row[0].isdigit():
        rank = row[0]
        if isinstance(row[1], dict):  # Has link
            album_text = row[1]['text'].strip()
            wiki_path = row[1]['link']
            # Clean up the wiki path
            wiki_path = wiki_path.replace('/wiki/', '')

            # Match with our CSV data
            if rank in albums_mapping:
                csv_album = albums_mapping[rank]['album']
                csv_artist = albums_mapping[rank]['artist']
                wikipedia_urls[csv_album] = wiki_path

# Write the mapping to a JSON file
with open('wikipedia_urls_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(wikipedia_urls, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(wikipedia_urls)} Wikipedia URLs")

# Also create a JavaScript file with the mapping
js_content = f"""// Auto-generated Wikipedia URL mappings
const wikipediaUrlMappings = {json.dumps(wikipedia_urls, indent=2, ensure_ascii=False)};

export default wikipediaUrlMappings;
"""

with open('wikipedia_urls.js', 'w', encoding='utf-8') as f:
    f.write(js_content)

print("Created wikipedia_urls.js file")