Raw Record Source

{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreig4ixqb4d7o3db6z2g56almix5mjmzg6h2yqxqq467k3skdzbr4xm",
    "uri": "at://did:plc:q76iwbelgruxmwyu3ujalh5m/app.bsky.feed.post/3mmt3q77q6fw2"
  },
  "coverImage": {
    "$type": "blob",
    "ref": {
      "$link": "bafkreia6oilj2wwsjjg4dhru7wi6ma7xx2wsjeuyrce4ck777p5wl4fwma"
    },
    "mimeType": "image/jpeg",
    "size": 107360
  },
  "path": "/blog/how-databricks-parsed-wikipedia-to-markdown-with-python/",
  "publishedAt": "2026-05-26T15:45:00.000Z",
  "site": "https://enterprise.wikimedia.com",
  "tags": [
    "Tutorials"
  ],
  "textContent": "Parsing raw wikitext into a clean text corpus is notoriously hard. Databricks engineers used Wikimedia Enterprise's Structured Contents endpoints and Apache Spark to convert millions of Wikipedia articles to Markdown at scale, skipping the regex-heavy parsing layer entirely.",
  "title": "How Databricks Parsed Wikipedia to Markdown with Python"
}