Raw Record Source

{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreihjo6u4ek4gwwu2qpkwd2vg4uqag2rj6j34kzuyzfei54d3hyojyi",
    "uri": "at://did:plc:pgryn3ephfd2xgft23qokfzt/app.bsky.feed.post/3mjuc2ir4a7q2"
  },
  "path": "/t/trace-score-a-metric-for-multi-turn-llm-consistency/175383#post_1",
  "publishedAt": "2026-04-19T14:26:45.000Z",
  "site": "https://discuss.huggingface.co",
  "tags": [
    "trace-score · PyPI",
    "github.com",
    "GitHub - Giri530/trace-score"
  ],
  "textContent": "Built a metric that evaluates the full conversation arc instead of individual turns.\n\nBERTScore for a conversation where the model ignores every user correction: 0.84.\nTRACE for the same conversation: 0.61.\n\nTRACE has five components — fact retention, self-contradiction, correction retention, topic coherence, confidence stability. Benchmarked on 102 conversations with Llama-3.1-8B. TRACE separates failure categories with a range of 0.277. BERTScore range is 0.044. The model retains user corrections 25% of the time. No per-turn metric can detect this.\n\nPyPi Package: trace-score · PyPI\n\ngithub.com\n\n### GitHub - Giri530/trace-score\n\nContribute to Giri530/trace-score development by creating an account on GitHub.",
  "title": "TRACE Score — a metric for multi-turn LLM consistency"
}