{
"$type": "site.standard.document",
"bskyPostRef": {
"cid": "bafyreig4ixqb4d7o3db6z2g56almix5mjmzg6h2yqxqq467k3skdzbr4xm",
"uri": "at://did:plc:q76iwbelgruxmwyu3ujalh5m/app.bsky.feed.post/3mmt3q77q6fw2"
},
"coverImage": {
"$type": "blob",
"ref": {
"$link": "bafkreia6oilj2wwsjjg4dhru7wi6ma7xx2wsjeuyrce4ck777p5wl4fwma"
},
"mimeType": "image/jpeg",
"size": 107360
},
"path": "/blog/how-databricks-parsed-wikipedia-to-markdown-with-python/",
"publishedAt": "2026-05-26T15:45:00.000Z",
"site": "https://enterprise.wikimedia.com",
"tags": [
"Tutorials"
],
"textContent": "Parsing raw wikitext into a clean text corpus is notoriously hard. Databricks engineers used Wikimedia Enterprise's Structured Contents endpoints and Apache Spark to convert millions of Wikipedia articles to Markdown at scale, skipping the regex-heavy parsing layer entirely.",
"title": "How Databricks Parsed Wikipedia to Markdown with Python"
}