{
"$type": "site.standard.document",
"bskyPostRef": {
"cid": "bafyreihjo6u4ek4gwwu2qpkwd2vg4uqag2rj6j34kzuyzfei54d3hyojyi",
"uri": "at://did:plc:pgryn3ephfd2xgft23qokfzt/app.bsky.feed.post/3mjuc2ir4a7q2"
},
"path": "/t/trace-score-a-metric-for-multi-turn-llm-consistency/175383#post_1",
"publishedAt": "2026-04-19T14:26:45.000Z",
"site": "https://discuss.huggingface.co",
"tags": [
"trace-score · PyPI",
"github.com",
"GitHub - Giri530/trace-score"
],
"textContent": "Built a metric that evaluates the full conversation arc instead of individual turns.\n\nBERTScore for a conversation where the model ignores every user correction: 0.84.\nTRACE for the same conversation: 0.61.\n\nTRACE has five components — fact retention, self-contradiction, correction retention, topic coherence, confidence stability. Benchmarked on 102 conversations with Llama-3.1-8B. TRACE separates failure categories with a range of 0.277. BERTScore range is 0.044. The model retains user corrections 25% of the time. No per-turn metric can detect this.\n\nPyPi Package: trace-score · PyPI\n\ngithub.com\n\n### GitHub - Giri530/trace-score\n\nContribute to Giri530/trace-score development by creating an account on GitHub.",
"title": "TRACE Score — a metric for multi-turn LLM consistency"
}