{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreihhywmdvndbex4g7q3mfguurf3cppupx4ejrorbw2jfesho6atasi",
    "uri": "at://did:plc:pgryn3ephfd2xgft23qokfzt/app.bsky.feed.post/3mirkcmj6mbw2"
  },
  "path": "/t/how-to-decode-csm-tokens-into-audio-tensors-for-streaming/160345#post_3",
  "publishedAt": "2026-04-05T17:19:18.000Z",
  "site": "https://discuss.huggingface.co",
  "tags": [
    "https://github.com/D3velop-llc/csm-rtx5090"
  ],
  "textContent": "I built a streaming pipeline for CSM-1B that handles the token-to-audio decode. The key issue is that HF’s StaticCache uses index_copy_ which breaks CUDA graphs. Replacing it with slice assignment + a persistent backbone cache gets you reduce-overhead compilation. Full code with patches and a demo server: https://github.com/D3velop-llc/csm-rtx5090",
  "title": "How to decode CSM tokens into audio tensors for streaming"
}