Raw Record Source

{
  "$type": "site.standard.document",
  "bskyPostRef": {
    "cid": "bafyreia6ica5ji2qeesdztjw4hzts6dv5i6udvxnek5hq2jy5p3zutcgim",
    "uri": "at://did:plc:s76si3rsl34tgvya7appi2q2/app.bsky.feed.post/3mehmwo5gvja2"
  },
  "path": "/outlier-and-collapse-the-enron-corpus-and-foundation-model-training-data/",
  "publishedAt": "2026-02-09T03:09:32.000Z",
  "site": "https://www.bespacific.com",
  "tags": [
    "AI",
    "Economy",
    "Education",
    "Energy",
    "Financial System",
    "Internet",
    "Knowledge Management",
    "Legal Research"
  ],
  "textContent": "Zimmer, Z. (2026). Outlier and collapse: The enron corpus and foundation model training data. Big Data & Society, 13(1). https://doi.org/10.1177/20539517261421474 (Original work published 2026) – “The Enron Corpus is a canonical training dataset representing one of the first scale jumps in the size of natural language data for machine learning (ML) research. That corpus was ...",
  "title": "Outlier and collapse: The enron corpus and foundation model training data"
}