External Publication
Visit Post

Persistent 0% prompt cache hits on GPT-5.5 with Auckland NZ Cloudflare 520s complicating every workaround

OpenAI Developer Community June 17, 2026
Source

Checklist - things that are likely non-cache inputs:

- Calls to different models
- Calls with a different service tier
- Calls with a different prompt cache key
- Calls past expiry (5-60 minutes)
- Calls with framework injections of text such as UUIDs
- Prompt IDs with variables, or varying prompt ID versions
- Not passing and maintaining a full chat history
- Varying or dropping the encrypted reasoning or the phase in the output being returned
- Responses with any kind of compaction

Possible: different localization routing, different organization or project, etc. OpenAI running different determinism fingerprint models on varying hardware vs 24hr retrieval, etc.

Then the big one: your actual API call (instructions + input) must simply be non-varying, only appending new inputs to a record kept with 100% fidelity.

Just for inspiration, I asked my AI pal (the one starting with C) for some tooling: a starting point for inspecting the string sequences you are sending, whether from logs or “live”. When you run it, you get far more “demo” presentation than a token encoder + integer-list matcher needs.

"""
token_cache_diff.py
-------------------
Compares two tiktoken-encoded integer sequences to find where their shared
prefix ends, and reports whether that prefix qualifies for OpenAI's prompt
caching discount (≥ 1024 tokens, counted in 128-token increments).

Typical use: encode your prompt at each API call and pass both encoded
sequences here to pinpoint where early content mutations break cache
eligibility between runs.
"""

from __future__ import annotations

import random
from dataclasses import dataclass
from typing import Optional

try:
    import tiktoken  # only needed for the encode helper
except ImportError:
    tiktoken = None  # type: ignore


# ─────────────────────────────────────────────────────────────────────────────
# Result container
# ─────────────────────────────────────────────────────────────────────────────

@dataclass(frozen=True)
class TokenDiffResult:
    """Immutable summary of how two token-integer sequences relate.

    Attributes:
        matching_prefix_len: Count of leading tokens shared by both
            sequences, i.e. indices 0 up to (not including) the first break.
        divergence_index: Position of the first mismatched token, or None
            when one sequence is a clean prefix of the other (identical,
            extension, or truncation).
        divergence_type: One of four labels:
            'identical'  – same length and no mismatched token.
            'extension'  – candidate is longer than reference, no mutations.
            'truncation' – candidate is shorter than reference, no mutations.
            'mutation'   – a token value differs at divergence_index.
        divergent_tokens: The pair (reference_token, candidate_token) found
            at the divergence point, or None when there is no mismatch.
        cache_eligible_len: Largest prefix length that qualifies for a
            caching discount; 0 if the matching prefix is under the minimum
            threshold.
        cache_tiers_hit: Number of cache tiers covered by cache_eligible_len.
    """

    # Field order is part of the constructor signature; keep it stable.
    matching_prefix_len: int
    divergence_index: Optional[int]
    divergence_type: str
    divergent_tokens: Optional[tuple[int, int]]
    cache_eligible_len: int
    cache_tiers_hit: int


# ─────────────────────────────────────────────────────────────────────────────
# Core comparison
# ─────────────────────────────────────────────────────────────────────────────

def compare_token_sequences(
    reference: list[int],
    candidate: list[int],
    cache_min_tokens: int = 1024,
    cache_increment: int = 128,
) -> TokenDiffResult:
    """
    Compare two tiktoken integer sequences and report where they first diverge.

    A shared prefix is valid only when every token from index 0 up to (but not
    including) the first divergence is identical.  A sequence that is strictly
    longer with no mutations is treated as a clean extension, not a mutation.

    Args:
        reference:        The baseline / earlier token sequence.
        candidate:        The later token sequence being compared.
        cache_min_tokens: Minimum matching prefix for a caching discount
                          (default 1024).  Must be >= 0.
        cache_increment:  Cache-tier size in tokens (default 128).  Must be > 0.

    Returns:
        TokenDiffResult with the matching length, divergence position/type,
        and the largest cache-eligible prefix length.

    Raises:
        ValueError: if cache_increment is not positive or cache_min_tokens is
            negative.  Checked up front so a bad configuration fails on every
            call rather than only once a prefix grows long enough to reach
            the tier arithmetic.

    Examples:
        >>> compare_token_sequences([1, 2, 3], [1, 2, 3]).divergence_type
        'identical'
        >>> compare_token_sequences([1, 2, 3], [1, 2, 3, 4]).divergence_type
        'extension'
        >>> compare_token_sequences([1, 2, 3, 4], [1, 2, 3]).divergence_type
        'truncation'
        >>> r = compare_token_sequences([1, 2, 9, 4], [1, 2, 3, 4])
        >>> r.divergence_index, r.matching_prefix_len
        (2, 2)
    """
    if cache_increment <= 0:
        raise ValueError(
            f"cache_increment must be a positive integer, got {cache_increment!r}"
        )
    if cache_min_tokens < 0:
        raise ValueError(
            f"cache_min_tokens must be non-negative, got {cache_min_tokens!r}"
        )

    # zip() stops at the shorter input, so only the overlap is scanned; next()
    # yields the first mismatching position, or None when the overlap agrees.
    divergence_index: Optional[int] = next(
        (
            position
            for position, (ref_tok, cand_tok) in enumerate(zip(reference, candidate))
            if ref_tok != cand_tok
        ),
        None,
    )

    divergent_tokens: Optional[tuple[int, int]] = None
    if divergence_index is not None:
        # A token differs inside the overlap: the prefix ends right before it.
        divergence_type = "mutation"
        matching_prefix_len = divergence_index
        divergent_tokens = (reference[divergence_index], candidate[divergence_index])
    else:
        # No mismatch: the whole overlap matches and only the lengths decide
        # how the two sequences relate.
        matching_prefix_len = min(len(reference), len(candidate))
        if len(reference) == len(candidate):
            divergence_type = "identical"
        elif len(candidate) > len(reference):
            divergence_type = "extension"
        else:
            divergence_type = "truncation"

    # Round the matching prefix down to the nearest tier boundary at or above
    # the minimum; below the minimum nothing is eligible.
    cache_eligible_len = 0
    cache_tiers_hit = 0
    if matching_prefix_len >= cache_min_tokens:
        full_steps = (matching_prefix_len - cache_min_tokens) // cache_increment
        cache_eligible_len = cache_min_tokens + full_steps * cache_increment
        cache_tiers_hit = full_steps + 1  # reaching the minimum counts as tier 1

    return TokenDiffResult(
        matching_prefix_len=matching_prefix_len,
        divergence_index=divergence_index,
        divergence_type=divergence_type,
        divergent_tokens=divergent_tokens,
        cache_eligible_len=cache_eligible_len,
        cache_tiers_hit=cache_tiers_hit,
    )


# ─────────────────────────────────────────────────────────────────────────────
# Convenience wrapper — encodes text first, then compares
# ─────────────────────────────────────────────────────────────────────────────

def compare_text_inputs(
    reference_text: str,
    candidate_text: str,
    model: str = "gpt-4o",
    cache_min_tokens: int = 1024,
    cache_increment: int = 128,
) -> TokenDiffResult:
    """
    Encode both strings with tiktoken and delegate to compare_token_sequences.

    If tiktoken has no tokeniser mapping for ``model`` (for example a model
    name newer than the installed tiktoken release), a warning is issued and
    the ``o200k_base`` encoding is used instead of failing.  Token positions
    reported in that case are only as accurate as that substitution.

    Args:
        reference_text:   The earlier / baseline prompt string.
        candidate_text:   The later prompt string to compare.
        model:            The OpenAI model name used to select the tokeniser.
        cache_min_tokens: Minimum matching prefix for a caching discount.
        cache_increment:  Cache-tier size in tokens.

    Returns:
        TokenDiffResult (same as compare_token_sequences).

    Raises:
        ImportError: if tiktoken is not installed.
    """
    if tiktoken is None:
        raise ImportError("tiktoken is required: pip install tiktoken")

    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # encoding_for_model raises KeyError for names it cannot map.  A
        # diagnostic tool is more useful with an approximate tokeniser than
        # with a crash, but the substitution must be visible to the caller.
        import warnings

        fallback_encoding = "o200k_base"
        warnings.warn(
            f"tiktoken has no tokeniser mapping for model {model!r}; "
            f"falling back to {fallback_encoding!r}",
            stacklevel=2,
        )
        enc = tiktoken.get_encoding(fallback_encoding)

    # Encoding.encode already returns a fresh list of ints, so no extra copy.
    return compare_token_sequences(
        enc.encode(reference_text),
        enc.encode(candidate_text),
        cache_min_tokens=cache_min_tokens,
        cache_increment=cache_increment,
    )


# ─────────────────────────────────────────────────────────────────────────────
# Console display helpers
# ─────────────────────────────────────────────────────────────────────────────

_W = 72  # number of characters between the left and right ║ borders of a row


def _rule(char: str = "─") -> str:
    """Return ``char`` repeated across the full inner width of the box."""
    return _W * char


def _header(title: str) -> str:
    """Return the three-line top of a box: top edge, title row, separator."""
    edge = _rule("═")
    # Two leading spaces, then pad with spaces out to the inner width.
    title_cell = f"  {title}".ljust(_W)
    return "\n".join((f"╔{edge}╗", f"║{title_cell}║", f"╠{edge}╣"))


def _divider() -> str:
    """Return the horizontal separator drawn between two sections."""
    return "╠" + _rule("═") + "╣"


def _footer() -> str:
    """Return the bottom edge of a box."""
    return "╚" + _rule("═") + "╝"


def _row(text: str = "") -> str:
    """Return one bordered content row, left-aligned and space-padded.

    Text longer than the inner width is not truncated, so the right border
    moves outwards for such rows.
    """
    return "║" + text.ljust(_W) + "║"


def _body(lines: list[str]) -> str:
    """Return every line wrapped as a bordered row, joined by newlines."""
    return "\n".join(map(_row, lines))


def _print_box(title: str, sections: list[list[str]]) -> None:
    """Print a titled box whose sections are separated by divider rules."""
    pieces = [_header(title)]
    for position, section in enumerate(sections):
        if position > 0:
            # Dividers go between sections only, never before the first one.
            pieces.append(_divider())
        pieces.append(_body(section))
    pieces.append(_footer())
    print(*pieces, sep="\n")


def _tier_bar(
    eligible_len: int,
    matched_len: int,
    cache_min: int = 1024,
    cache_inc: int = 128,
) -> str:
    """Render a compact tier bar with a covered/total count.

    One ▓ is drawn per tier counted from ``eligible_len`` and one ░ per
    further tier counted from ``matched_len``.  When the next tier boundary
    lies beyond ``matched_len``, the number of tokens still missing is
    appended.  Below ``cache_min`` a fixed "n/a" message is returned.
    """
    if matched_len < cache_min:
        return "n/a  (below minimum threshold)"

    def tiers_up_to(length: int) -> int:
        # Reaching the minimum counts as tier 1; each full increment adds one.
        return (length - cache_min) // cache_inc + 1

    total = tiers_up_to(matched_len)
    covered = tiers_up_to(eligible_len)
    cells = "".join(("▓" * covered, "░" * (total - covered)))

    label = f"[{cells}]  {covered}/{total}"
    # Distance from the matched prefix to the boundary after the covered tiers.
    missing = cache_min + covered * cache_inc - matched_len
    if missing > 0:
        label += f"  (+{missing} to tier {covered + 1})"
    return label


def _result_rows(
    result: TokenDiffResult,
    ref_len: int,
    cand_len: int,
    cache_min: int = 1024,
    cache_inc: int = 128,
) -> list[str]:
    """Format one comparison result as the text rows of a box section.

    The rows list both sequence lengths with the signed difference, the
    divergence type and raw matched prefix, the first break (only when a
    token differs), and then either the cache-eligible summary with its tier
    bar or an explanation of why no discount applies.
    """
    length_change = cand_len - ref_len
    plus = "" if length_change < 0 else "+"  # negatives already carry "-"

    kind = result.divergence_type
    glyphs = {"identical": "≡", "extension": "→", "truncation": "←", "mutation": "✗"}
    matched = result.matching_prefix_len

    out: list[str] = [
        f"  Reference length  : {ref_len:,} tokens",
        f"  Candidate length  : {cand_len:,} tokens  ({plus}{length_change:,})",
        "",
        f"  Divergence type   : {glyphs.get(kind, '?')}  {kind}",
        f"  Prefix matched    : {matched:,} tokens  (raw)",
    ]

    if result.divergence_index is not None:
        ref_tok, cand_tok = result.divergent_tokens  # type: ignore[misc]
        out.append(
            f"  First break       : index {result.divergence_index:,}"
            f"  [ref={ref_tok}  cand={cand_tok}]"
        )
    out.append("")

    if not result.cache_eligible_len:
        # No discount: show how far the raw prefix falls short of the minimum.
        deficit = cache_min - matched
        raw_line = f"  Raw prefix only   : {matched:,} tokens"
        if deficit > 0:
            raw_line += f"  (short by {deficit:,})"
        out.extend([
            f"  Cache-eligible    : 0  (need >= {cache_min:,} matching tokens)",
            raw_line,
            "",
            "  ✗  No cache discount — prefix too short or mutated",
        ])
        return out

    bar = _tier_bar(
        result.cache_eligible_len, result.matching_prefix_len, cache_min, cache_inc
    )
    out.extend([
        f"  Cache-eligible    : {result.cache_eligible_len:,} tokens",
        "  Tier bar          : " + bar,
        "",
        f"  ✓  Valid cache prefix — {result.cache_tiers_hit} tier(s) covered",
    ])
    return out


# ─────────────────────────────────────────────────────────────────────────────
# Demo — simulated multi-turn chat context with caching diagnostics
# ─────────────────────────────────────────────────────────────────────────────

def _run_demo() -> None:
    """Print a multi-turn walkthrough of the prefix diff on simulated tokens.

    Seeds the random generator, builds a base context plus two appended
    rounds, and prints one box per step: two clean extensions (Turn 1 and
    Turn 2) followed by a branch where a single early token is changed.
    All numbers shown in the box titles and rule text are derived from the
    constants below so the narration cannot drift from the data.
    """
    seed = 42
    cache_min = 1024
    cache_inc = 128
    random.seed(seed)

    # ── Build simulated token sequences ──────────────────────────────────────
    # Token IDs are random integers in [1, vocab_max].  That bound is the size
    # of a GPT-2-style vocabulary, not the larger one GPT-4o uses; only
    # equality of IDs matters for the comparison, so the range is cosmetic.
    vocab_max = 50_256

    base_len = 1_500    # system prompt + previous conversation context
    round1_len = 210    # first new user message (Turn 1)
    round2_len = 195    # second new user message (Turn 2)

    def fake_tokens(count: int) -> list[int]:
        return [random.randint(1, vocab_max) for _ in range(count)]

    # Generation order (base, round1, round2) fixes the seeded values.
    base = fake_tokens(base_len)
    round1 = fake_tokens(round1_len)
    round2 = fake_tokens(round2_len)

    seq_r0 = base                        # initial cached context
    seq_r1 = base + round1               # after Round 1
    seq_r2 = base + round1 + round2      # after Round 2

    # Branch: silently mutate one token deep inside the base context, then
    # re-append the same round1 and round2 suffixes.  Total length unchanged.
    # Adding 999 modulo vocab_max always yields a different ID because 999 is
    # not a multiple of the modulus.
    mutation_idx = 47
    mutated_base = base[:]
    mutated_base[mutation_idx] = (mutated_base[mutation_idx] + 999) % vocab_max
    seq_branch = mutated_base + round1 + round2

    # First four tier boundaries, e.g. "1024 -> 1152 -> 1280 -> 1408".
    tier_ladder = " -> ".join(str(cache_min + step * cache_inc) for step in range(4))

    # ── Intro ─────────────────────────────────────────────────────────────────
    print()
    _print_box(
        "  TOKEN CACHE PREFIX DIFF — MULTI-TURN DEMO",
        [[
            "  Simulated tiktoken integer sequences (no real model call needed).",
            "  Each turn compares the previous full prompt against the new one,",
            "  mirroring how you would call compare_token_sequences() in practice.",
            "",
            f"  Cache discount rule: prefix >= {cache_min:,} tokens, aligned to",
            f"  {cache_inc}-token tiers  ({tier_ladder} -> ...)",
            "",
            "  Tier bar key:  ▓ = cache tier covered   ░ = tier within reach",
        ]],
    )
    print()

    # ── Turn 0: base context seeded ───────────────────────────────────────────
    _print_box(
        f"  TURN 0  ·  Base Context Seeded  (seed={seed})",
        [[
            f"  {base_len:,} tokens generated — system prompt + prior assistant turns",
            "  already present in the context window.",
            "",
            "  Stored as the cache reference.  No comparison yet.",
        ]],
    )
    print()

    # ── Turn 1: first user round ──────────────────────────────────────────────
    r1 = compare_token_sequences(seq_r0, seq_r1, cache_min, cache_inc)

    _print_box(
        f"  TURN 1  ·  Round 1 User Input  (+{round1_len} tokens appended)",
        [
            [
                f"  {round1_len} new user-message tokens appended to the base context.",
                "  ref = stored cache (Turn 0)    cand = new full prompt",
            ],
            _result_rows(r1, len(seq_r0), len(seq_r1), cache_min, cache_inc),
        ],
    )
    print()

    # ── Turn 2: second user round ─────────────────────────────────────────────
    r2 = compare_token_sequences(seq_r1, seq_r2, cache_min, cache_inc)

    tier_delta = r2.cache_tiers_hit - r1.cache_tiers_hit
    if tier_delta > 0:
        tier_note = (
            f"  ↑  +{tier_delta} tier(s) vs Turn 1 — grew past"
            f" {tier_delta} x {cache_inc}-token boundary(s)."
        )
    else:
        tier_note = "  —  No new cache tier boundary crossed since Turn 1."

    _print_box(
        f"  TURN 2  ·  Round 2 User Input  (+{round2_len} tokens appended)",
        [
            [
                f"  {round2_len} more tokens appended.  Context keeps growing cleanly.",
                "  ref = Turn 1 full prompt       cand = Turn 2 full prompt",
            ],
            _result_rows(r2, len(seq_r1), len(seq_r2), cache_min, cache_inc),
            [tier_note],
        ],
    )
    print()

    # ── Turn 3: branch / mutation ─────────────────────────────────────────────
    r3 = compare_token_sequences(seq_r2, seq_branch, cache_min, cache_inc)

    _print_box(
        "  TURN 3  ·  Branch — Early Mutation Detected",
        [
            [
                f"  Token at index {mutation_idx} was silently changed inside the base context.",
                f"  Total length is unchanged ({len(seq_branch):,} tokens) — mutation is subtle.",
                "  ref = Turn 2 full prompt       cand = mutated branch",
            ],
            _result_rows(r3, len(seq_r2), len(seq_branch), cache_min, cache_inc),
            [
                f"  ⚠  Prefix breaks at index {r3.divergence_index}."
                f"  All {r2.cache_tiers_hit} previously earned tier(s) wiped.",
                "  Server must recompute the KV-cache from scratch.",
                "",
                "  Common causes of early mutation:",
                "    · Timestamp / request-ID injected into system prompt",
                "    · Dynamic fields (username, locale) placed before static content",
                "    · Tool-call results inserted ahead of the stable context block",
            ],
        ],
    )
    print()


if __name__ == "__main__":
    _run_demo()

Discussion in the ATmosphere

Loading comments...