{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreigdant6dwrkzc64367fkkmnqokfbhfh7mfzwdduhnpxj3jgndal7y",
    "uri": "at://did:plc:jo3wjj2gx46alocis4wubmwr/app.bsky.feed.post/3mghiwp4axqm2"
  },
  "path": "/blog/2026/03/07/html-sentence-segmentation/",
  "publishedAt": "2026-03-06T23:30:00.000Z",
  "site": "https://thottingal.in",
  "tags": [
    "html-sentence-segmenter"
  ],
  "textContent": "If you have ever needed to work with sentences inside an HTML page — highlight them, translate them, read them aloud — you quickly run into a deceptively awkward problem. The text is not plain text. It is interspersed with tags, attributes, inline elements, and markup that your sentence detector has no business reading.\n\nThis post walks through an exploratory JavaScript project — html-sentence-segmenter — that I built to figure out how to do this properly. It is not a polished, reusable library. Think of it as a working proof-of-concept that demonstrates the approach, with a live demo using Wikipedia articles.",
  "title": "How to identify and annotate sentences in an HTML page"
}