{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreiar7ouny5hpb7xubcr7i2txbe7d4gu5odonsrltaayvpyvzyljiue",
    "uri": "at://did:plc:ibeorkuxddnwy45ii5pezbgb/app.bsky.feed.post/3mjpdzaqsl232"
  },
  "coverImage": {
    "$type": "blob",
    "ref": {
      "$link": "bafkreialfunbq6cvwf2dxtfiivqwzi7gltepplwunbosfw23va7gfs6ucq"
    },
    "mimeType": "image/png",
    "size": 107191
  },
  "path": "/unweight-tensor-compression/",
  "publishedAt": "2026-04-17T13:00:00.000Z",
  "site": "https://blog.cloudflare.com",
  "tags": [
    "Agents Week",
    "Research",
    "AI"
  ],
  "textContent": "Running LLMs across Cloudflare’s network requires us to be smarter and more efficient about GPU memory bandwidth. That’s why we developed Unweight, a lossless inference-time compression system that achieves up to a 22% model footprint reduction, so that we can deliver faster and cheaper inference than ever before.",
  "title": "Unweight: how we compressed an LLM 22% without sacrificing quality"
}