Remove temporary CSV files
- Remove intermediate analysis files used during data processing
- Keep only essential files: top_500_albums_2023.csv, rolling_stone_top_500_albums_2020.csv, wikipedia_top_500_albums.csv
- Add Wikipedia URL extraction script and mapping files

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
parent fce2c6ff1e
commit 5279d3bbba
9 changed files with 614 additions and 1233 deletions
113  extract_wikipedia_urls.py  Normal file
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
import urllib.request
import re
import json
import csv
from html.parser import HTMLParser

class WikipediaLinksExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        self.current_row = []
        self.current_cell_data = []
        self.current_link = None
        self.data = []
        self.cell_count = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            classes = dict(attrs).get('class', '')
            if 'wikitable' in classes:
                self.in_table = True
        elif self.in_table and tag == 'tr':
            self.in_row = True
            self.current_row = []
            self.cell_count = 0
        elif self.in_row and tag in ['td', 'th']:
            self.in_cell = True
            self.current_cell_data = []
            self.current_link = None
            self.cell_count += 1
        elif self.in_cell and tag == 'a':
            href = dict(attrs).get('href', '')
            if href.startswith('/wiki/') and self.cell_count == 2:  # Album column
                self.current_link = href

    def handle_data(self, data):
        if self.in_cell:
            self.current_cell_data.append(data.strip())

    def handle_endtag(self, tag):
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr' and self.in_row:
            self.in_row = False
            if len(self.current_row) >= 3:  # Ensure we have rank, album, artist
                self.data.append(self.current_row)
        elif tag in ['td', 'th'] and self.in_cell:
            self.in_cell = False
            cell_text = ' '.join(self.current_cell_data).strip()
            if self.cell_count == 2 and self.current_link:  # Album column with link
                self.current_row.append({
                    'text': cell_text,
                    'link': self.current_link
                })
            else:
                self.current_row.append(cell_text)

# Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Albums/500"
response = urllib.request.urlopen(url)
html_content = response.read().decode('utf-8')

# Parse the HTML
parser = WikipediaLinksExtractor()
parser.feed(html_content)

# Read our CSV file to match album names
albums_mapping = {}
with open('top_500_albums_2023.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        rank = row['Rank']
        album = row['Album'].strip()
        artist = row['Artist'].strip()
        albums_mapping[rank] = {'album': album, 'artist': artist}

# Create the mapping
wikipedia_urls = {}
for row in parser.data[1:]:  # Skip header
    if len(row) >= 3 and isinstance(row[0], str) and row[0].isdigit():
        rank = row[0]
        if isinstance(row[1], dict):  # Has link
            album_text = row[1]['text'].strip()
            wiki_path = row[1]['link']
            # Clean up the wiki path
            wiki_path = wiki_path.replace('/wiki/', '')

            # Match with our CSV data
            if rank in albums_mapping:
                csv_album = albums_mapping[rank]['album']
                csv_artist = albums_mapping[rank]['artist']
                wikipedia_urls[csv_album] = wiki_path

# Write the mapping to a JSON file
with open('wikipedia_urls_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(wikipedia_urls, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(wikipedia_urls)} Wikipedia URLs")

# Also create a JavaScript file with the mapping
js_content = f"""// Auto-generated Wikipedia URL mappings
const wikipediaUrlMappings = {json.dumps(wikipedia_urls, indent=2, ensure_ascii=False)};

export default wikipediaUrlMappings;
"""

with open('wikipedia_urls.js', 'w', encoding='utf-8') as f:
    f.write(js_content)

print("Created wikipedia_urls.js file")
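For context, here is a minimal sketch of how the generated wikipedia_urls_mapping.json might be consumed downstream. It is illustrative only and not part of this commit; it assumes extract_wikipedia_urls.py has already been run in the same directory, and the URL reconstruction simply reverses the '/wiki/' stripping done above.

#!/usr/bin/env python3
# Illustrative sketch (not part of this commit): consume the generated mapping.
import json

# The extractor writes album title -> Wikipedia path (with the '/wiki/' prefix stripped).
with open('wikipedia_urls_mapping.json', 'r', encoding='utf-8') as f:
    mapping = json.load(f)

# Rebuild full article URLs for a few entries.
for album, path in list(mapping.items())[:5]:
    print(f"{album}: https://en.wikipedia.org/wiki/{path}")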