{
"$type": "site.standard.document",
"bskyPostRef": {
"cid": "bafyreiar7ouny5hpb7xubcr7i2txbe7d4gu5odonsrltaayvpyvzyljiue",
"uri": "at://did:plc:ibeorkuxddnwy45ii5pezbgb/app.bsky.feed.post/3mjpdzaqsl232"
},
"coverImage": {
"$type": "blob",
"ref": {
"$link": "bafkreialfunbq6cvwf2dxtfiivqwzi7gltepplwunbosfw23va7gfs6ucq"
},
"mimeType": "image/png",
"size": 107191
},
"path": "/unweight-tensor-compression/",
"publishedAt": "2026-04-17T13:00:00.000Z",
"site": "https://blog.cloudflare.com",
"tags": [
"Agents Week",
"Research",
"AI"
],
"textContent": "Running LLMs across Cloudflare’s network requires us to be smarter and more efficient about GPU memory bandwidth. That’s why we developed Unweight, a lossless inference-time compression system that achieves up to a 22% model footprint reduction, so that we can deliver faster and cheaper inference than ever before.",
"title": "Unweight: how we compressed an LLM 22% without sacrificing quality"
}