{
"$type": "site.standard.document",
"bskyPostRef": {
"cid": "bafyreiepajufgi27h35mhmpsrcfbmda3bmkbilzzg5sxoo74s5xkwyqs4e",
"uri": "at://did:plc:ibeorkuxddnwy45ii5pezbgb/app.bsky.feed.post/3mjnvexw57k32"
},
"coverImage": {
"$type": "blob",
"ref": {
"$link": "bafkreieyxghunodyqiirgoyvk3w2liqzp5od6dpnk3l575rbeaxddkyxdi"
},
"mimeType": "image/png",
"size": 170405
},
"path": "/high-performance-llms/",
"publishedAt": "2026-04-16T14:00:00.000Z",
"site": "https://blog.cloudflare.com",
"tags": [
"Agents Week",
"Agents",
"AI",
"Developer Platform",
"Developers",
"Infrastructure",
"Workers AI"
],
"textContent": "We built a custom technology stack to run fast large language models on Cloudflare’s infrastructure. This post explores the engineering trade-offs and technical optimizations required to make high-performance AI inference accessible.",
"title": "Building the foundation for running extra-large language models"
}