Raw Record Source

{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreidbjll3oyrbunvqhierypxoc3pilbkhgy5vtdjsf4qqgdt3c2pq5u",
    "uri": "at://did:plc:l5eubmmg57bpj76f4frmgy3r/app.bsky.feed.post/3mo5yharcidf2"
  },
  "coverImage": {
    "$type": "blob",
    "ref": {
      "$link": "bafkreibhuafblgztdowb73svcu3jhxwrdhu5slg3razt2e3lbonnh54spm"
    },
    "mimeType": "image/png",
    "size": 159713
  },
  "path": "/blog/olmo-eval",
  "publishedAt": "2026-06-12T08:00:00.000Z",
  "site": "https://allenai.org",
  "textContent": "olmo-eval is an open evaluation workbench that helps model developers add, run, and analyze benchmarks across changing LLM checkpoints, extending OLMES from final-score reproducibility into the day-to-day model development loop.",
  "title": "olmo-eval: An evaluation workbench for the model development loop"
}