top500albums/extract_wikipedia_urls.py

#!/usr/bin/env python3
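"""Extract Wikipedia article URLs for the top 500 albums list.

Parses the tables on the Wikipedia:WikiProject_Albums/500 page, matches each
linked album to top_500_albums_2023.csv by rank, and writes the resulting
mapping to wikipedia_urls_mapping.json and wikipedia_urls.js.
"""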
import csv
import json
import urllib.request
from html.parser import HTMLParser

class WikipediaLinksExtractor(HTMLParser):
    """Collect rows from every 'wikitable', keeping the /wiki/ link found in the Album column."""

    def __init__(self):
        super().__init__()
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        self.current_row = []
        self.current_cell_data = []
        self.current_link = None
        self.data = []        # completed rows collected from all wikitables
        self.cell_count = 0   # 1-based index of the current cell within its row

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            classes = dict(attrs).get('class', '')
            if 'wikitable' in classes:
                self.in_table = True
        elif self.in_table and tag == 'tr':
            self.in_row = True
            self.current_row = []
            self.cell_count = 0
        elif self.in_row and tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell_data = []
            self.current_link = None
            self.cell_count += 1
        elif self.in_cell and tag == 'a':
            href = dict(attrs).get('href', '')
            if href.startswith('/wiki/') and self.cell_count == 2:  # Album column
                self.current_link = href

    def handle_data(self, data):
        if self.in_cell:
            self.current_cell_data.append(data.strip())

    def handle_endtag(self, tag):
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr' and self.in_row:
            self.in_row = False
            if len(self.current_row) >= 3:  # Ensure we have rank, album, artist
                self.data.append(self.current_row)
        elif tag in ['td', 'th'] and self.in_cell:
            self.in_cell = False
            cell_text = ' '.join(self.current_cell_data).strip()
            if self.cell_count == 2 and self.current_link:  # Album column with link
                self.current_row.append({
                    'text': cell_text,
                    'link': self.current_link
                })
            else:
                self.current_row.append(cell_text)

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Albums/500"
with urllib.request.urlopen(url) as response:
    html_content = response.read().decode('utf-8')

# Parse the HTML
parser = WikipediaLinksExtractor()
parser.feed(html_content)
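
# parser.data now holds one list per table row; the Album cell is a dict with
# 'text' and 'link' keys when it carried a /wiki/ link, otherwise a plain string.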

# Read our CSV file to match album names
albums_mapping = {}
with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        rank = row['Rank']
        album = row['Album'].strip()
        artist = row['Artist'].strip()
        albums_mapping[rank] = {'album': album, 'artist': artist}

# Create the mapping
wikipedia_urls = {}
for row in parser.data[1:]:  # Skip header
    if len(row) >= 3 and isinstance(row[0], str) and row[0].isdigit():
        rank = row[0]
        if isinstance(row[1], dict):  # Has link
            album_text = row[1]['text'].strip()
            wiki_path = row[1]['link']
            # Clean up the wiki path
            wiki_path = wiki_path.replace('/wiki/', '')
            # Match with our CSV data
            if rank in albums_mapping:
                csv_album = albums_mapping[rank]['album']
                csv_artist = albums_mapping[rank]['artist']
                wikipedia_urls[csv_album] = wiki_path

# Write the mapping to a JSON file
with open('wikipedia_urls_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(wikipedia_urls, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(wikipedia_urls)} Wikipedia URLs")

# Also create a JavaScript file with the mapping
js_content = f"""// Auto-generated Wikipedia URL mappings
const wikipediaUrlMappings = {json.dumps(wikipedia_urls, indent=2, ensure_ascii=False)};
export default wikipediaUrlMappings;
"""

with open('wikipedia_urls.js', 'w', encoding='utf-8') as f:
    f.write(js_content)

print("Created wikipedia_urls.js file")