#!/usr/bin/env python3
"""Extract Wikipedia article links for albums listed in a wikitable and
match them against a local CSV of the top 500 albums."""

import csv
import json
import urllib.request
from html.parser import HTMLParser


class WikipediaLinksExtractor(HTMLParser):
    """Collect rows from any 'wikitable' on the page, keeping the /wiki/
    link found in the album column (the second cell of each row)."""

    def __init__(self):
        super().__init__()
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        self.current_row = []
        self.current_cell_data = []
        self.current_link = None
        self.data = []
        self.cell_count = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            classes = dict(attrs).get('class') or ''
            if 'wikitable' in classes:
                self.in_table = True
        elif self.in_table and tag == 'tr':
            self.in_row = True
            self.current_row = []
            self.cell_count = 0
        elif self.in_row and tag in ('td', 'th'):
            self.in_cell = True
            self.current_cell_data = []
            self.current_link = None
            self.cell_count += 1
        elif self.in_cell and tag == 'a':
            href = dict(attrs).get('href') or ''
            # Only keep internal article links from the album column (cell 2).
            if href.startswith('/wiki/') and self.cell_count == 2:
                self.current_link = href

    def handle_data(self, data):
        if self.in_cell:
            self.current_cell_data.append(data.strip())

    def handle_endtag(self, tag):
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr' and self.in_row:
            self.in_row = False
            if len(self.current_row) >= 3:  # Ensure we have rank, album, artist.
                self.data.append(self.current_row)
        elif tag in ('td', 'th') and self.in_cell:
            self.in_cell = False
            cell_text = ' '.join(self.current_cell_data).strip()
            if self.cell_count == 2 and self.current_link:
                # Album column: keep both the display text and the link.
                self.current_row.append({'text': cell_text, 'link': self.current_link})
            else:
                self.current_row.append(cell_text)


# Fetch the Wikipedia page. Wikipedia rejects requests that use the default
# urllib User-Agent, so send a descriptive one.
url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Albums/500"
request = urllib.request.Request(url, headers={'User-Agent': 'album-links-extractor/1.0'})
with urllib.request.urlopen(request) as response:
    html_content = response.read().decode('utf-8')

# Parse the HTML.
parser = WikipediaLinksExtractor()
parser.feed(html_content)

# Read our CSV file so ranks can be matched back to album names.
albums_mapping = {}
with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        albums_mapping[row['Rank']] = {
            'album': row['Album'].strip(),
            'artist': row['Artist'].strip(),
        }

# Build the album -> Wikipedia article-slug mapping.
wikipedia_urls = {}
for row in parser.data[1:]:  # Skip the header row.
    if len(row) >= 3 and isinstance(row[0], str) and row[0].isdigit():
        rank = row[0]
        if isinstance(row[1], dict) and rank in albums_mapping:  # Album cell has a link.
            # Strip only the leading /wiki/ prefix; the remainder is the slug.
            wiki_path = row[1]['link'].replace('/wiki/', '', 1)
            wikipedia_urls[albums_mapping[rank]['album']] = wiki_path

# Write the mapping to a JSON file.
with open('wikipedia_urls_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(wikipedia_urls, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(wikipedia_urls)} Wikipedia URLs")

# Also emit a JavaScript module with the same mapping.
js_content = f"""// Auto-generated Wikipedia URL mappings
const wikipediaUrlMappings = {json.dumps(wikipedia_urls, indent=2, ensure_ascii=False)};

export default wikipediaUrlMappings;
"""

with open('wikipedia_urls.js', 'w', encoding='utf-8') as f:
    f.write(js_content)

print("Created wikipedia_urls.js file")
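
# Usage (a sketch; the script filename below is illustrative, and the CSV
# with Rank/Album/Artist columns is assumed to sit alongside the script):
#
#   python3 extract_wikipedia_urls.py
#
# wikipedia_urls_mapping.json then maps album titles to article slugs that
# can be appended to https://en.wikipedia.org/wiki/ — a hypothetical entry:
#
#   "Abbey Road": "Abbey_Road"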