Raw Record Source

{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreid3iovkinlxf5qms6h2e2gwveqzo2kx3skjebahnz2tghytgenbty",
    "uri": "at://did:plc:jo3wjj2gx46alocis4wubmwr/app.bsky.feed.post/3mftynsqrgmy2"
  },
  "path": "/blog/2026/02/27/malayalam-tokenizer-llm/",
  "publishedAt": "2026-02-26T23:30:00.000Z",
  "site": "https://thottingal.in",
  "textContent": "Standard LLMs fragment Malayalam words into 15+ meaningless pieces, destroying the semantic signal required for learning. This post details the training of custom BPE and Unigram tokenizers, and explores why resolving fragmentation is the necessary first step toward solving the larger problems of data scarcity and complex morphology.",
  "title": "The Broken Token: Tokenization for Malayalam Language Models"
}