Persistent 0% prompt cache hits on GPT-5.5 with Auckland NZ Cloudflare 520s complicating every workaround
Checklist - things that are likely non-cache inputs:
Calls to different models; calls with a different service tier; calls with a different prompt cache key; calls past expiry (5-60 minutes); calls with framework injections of text such as UUIDs; prompt IDs with variables, or varying prompt ID versions; not passing and maintaining a full chat history; varying or dropping encrypted reasoning, or the phase in output being returned; responses with any kind of compaction.
Possible: different localization routing, different organization or project, etc. OpenAI running different determinism fingerprint models on varying hardware vs 24hr retrieval, etc.
Then the big one: whether your actual API call (instructions + input) is truly non-varying, only appending new inputs to a record kept at 100% fidelity.
Just for inspiration, I asked my AI pal (the one starting with C) for some tooling: a starting point for inspecting the string sequences you are sending, whether from past logs or “live”. It comes with far more “demo” presentation than a token encoder + integer-list matcher needs when you run it.
"""
token_cache_diff.py
-------------------
Compares two tiktoken-encoded integer sequences to find where their shared
prefix ends, and reports whether that prefix qualifies for OpenAI's prompt
caching discount (≥ 1024 tokens, counted in 128-token increments).
Typical use: encode your prompt at each API call and pass both encoded
sequences here to pinpoint where early content mutations break cache
eligibility between runs.
"""
from __future__ import annotations
import random
from dataclasses import dataclass
from typing import Optional
try:
import tiktoken # only needed for the encode helper
except ImportError:
tiktoken = None # type: ignore
# ─────────────────────────────────────────────────────────────────────────────
# Result container
# ─────────────────────────────────────────────────────────────────────────────
@dataclass(frozen=True)
class TokenDiffResult:
    """Immutable summary of a prefix comparison between two token sequences.

    Attributes:
        matching_prefix_len: Count of tokens that are identical starting at
            index 0 and stopping just before the first break.
        divergence_index: Position of the first mismatched token. ``None``
            when one sequence is a clean prefix of the other (identical,
            extension, or truncation).
        divergence_type: One of
            ``'identical'``  - both sequences are exactly the same;
            ``'extension'``  - candidate grew beyond reference, no mutations;
            ``'truncation'`` - candidate is shorter than reference, no mutations;
            ``'mutation'``   - a token value differs at ``divergence_index``.
        divergent_tokens: ``(reference_token, candidate_token)`` at the
            divergence point, or ``None`` when there is no mismatch.
        cache_eligible_len: Largest prefix length that qualifies for a caching
            discount; ``0`` if the matching prefix is under the minimum.
        cache_tiers_hit: Number of cache tiers covered by
            ``cache_eligible_len``.
    """

    matching_prefix_len: int
    divergence_index: Optional[int]
    divergence_type: str
    divergent_tokens: Optional[tuple[int, int]]
    cache_eligible_len: int
    cache_tiers_hit: int
# ─────────────────────────────────────────────────────────────────────────────
# Core comparison
# ─────────────────────────────────────────────────────────────────────────────
def compare_token_sequences(
    reference: list[int],
    candidate: list[int],
    cache_min_tokens: int = 1024,
    cache_increment: int = 128,
) -> TokenDiffResult:
    """
    Compare two token-integer sequences and report where they first diverge.

    A shared prefix is valid only when every token from index 0 up to (but not
    including) the first divergence is identical. A sequence that is strictly
    longer with no mutations is treated as a clean extension, not a mutation.

    Args:
        reference: The baseline / earlier token sequence.
        candidate: The later token sequence being compared.
        cache_min_tokens: Minimum matching prefix for a caching discount (default 1024).
        cache_increment: Cache-tier size in tokens (default 128). Must be > 0.

    Returns:
        TokenDiffResult with the matching length, divergence position/type,
        and the largest cache-eligible prefix length.

    Raises:
        ValueError: if ``cache_increment`` is not a positive integer. Without
            this check a zero increment only failed (ZeroDivisionError) once
            the prefix reached the minimum, and a negative one silently
            produced meaningless tier counts.

    Examples:
        >>> compare_token_sequences([1, 2, 3], [1, 2, 3]).divergence_type
        'identical'
        >>> compare_token_sequences([1, 2, 3], [1, 2, 3, 4]).divergence_type
        'extension'
        >>> compare_token_sequences([1, 2, 3, 4], [1, 2, 3]).divergence_type
        'truncation'
        >>> r = compare_token_sequences([1, 2, 9, 4], [1, 2, 3, 4])
        >>> r.divergence_index, r.matching_prefix_len
        (2, 2)
    """
    if cache_increment <= 0:
        raise ValueError(
            f"cache_increment must be a positive integer, got {cache_increment!r}"
        )

    # zip() stops at the shorter sequence, so only the overlap is scanned.
    divergence_index: Optional[int] = next(
        (
            i
            for i, (ref_tok, cand_tok) in enumerate(zip(reference, candidate))
            if ref_tok != cand_tok
        ),
        None,
    )

    # Matching prefix is everything before the break (or the full overlap).
    if divergence_index is None:
        matching_prefix_len = min(len(reference), len(candidate))
    else:
        matching_prefix_len = divergence_index

    # Classify the relationship between the two sequences.
    divergent_tokens: Optional[tuple[int, int]] = None
    if divergence_index is not None:
        divergence_type = "mutation"
        divergent_tokens = (reference[divergence_index], candidate[divergence_index])
    elif len(reference) == len(candidate):
        divergence_type = "identical"
    elif len(candidate) > len(reference):
        divergence_type = "extension"
    else:
        divergence_type = "truncation"

    # Largest prefix that lands on a cache-tier boundary: round the part of
    # the prefix beyond the minimum down to a whole number of increments.
    cache_eligible_len = 0
    cache_tiers_hit = 0
    if matching_prefix_len >= cache_min_tokens:
        extra_tiers = (matching_prefix_len - cache_min_tokens) // cache_increment
        cache_eligible_len = cache_min_tokens + extra_tiers * cache_increment
        cache_tiers_hit = extra_tiers + 1  # the minimum itself counts as tier 1

    return TokenDiffResult(
        matching_prefix_len=matching_prefix_len,
        divergence_index=divergence_index,
        divergence_type=divergence_type,
        divergent_tokens=divergent_tokens,
        cache_eligible_len=cache_eligible_len,
        cache_tiers_hit=cache_tiers_hit,
    )
# ─────────────────────────────────────────────────────────────────────────────
# Convenience wrapper — encodes text first, then compares
# ─────────────────────────────────────────────────────────────────────────────
def compare_text_inputs(
    reference_text: str,
    candidate_text: str,
    model: str = "gpt-4o",
    cache_min_tokens: int = 1024,
    cache_increment: int = 128,
) -> TokenDiffResult:
    """
    Tokenise two prompt strings with tiktoken, then diff the token sequences.

    Args:
        reference_text: The earlier / baseline prompt string.
        candidate_text: The later prompt string to compare.
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick
            the tokeniser.
        cache_min_tokens: Minimum matching prefix for a caching discount.
        cache_increment: Cache-tier size in tokens.

    Returns:
        The TokenDiffResult produced by compare_token_sequences.

    Raises:
        ImportError: if tiktoken is not installed.
    """
    # The module-level import is optional; fail here with an actionable hint.
    if tiktoken is None:
        raise ImportError("tiktoken is required: pip install tiktoken")

    encoder = tiktoken.encoding_for_model(model)
    reference_tokens = list(encoder.encode(reference_text))
    candidate_tokens = list(encoder.encode(candidate_text))
    return compare_token_sequences(
        reference_tokens,
        candidate_tokens,
        cache_min_tokens=cache_min_tokens,
        cache_increment=cache_increment,
    )
# ─────────────────────────────────────────────────────────────────────────────
# Console display helpers
# ─────────────────────────────────────────────────────────────────────────────
_W = 72  # inner width of each box row: number of columns between the two ║ borders
def _rule(char: str = "─") -> str:
    """Return ``char`` repeated ``_W`` times, i.e. one full-width horizontal rule."""
    return _W * char
def _header(title: str) -> str:
    """Return the three-line title bar of a box: top border, title row, separator.

    The title row is padded to exactly ``_W`` inner columns so its right-hand
    ║ lines up with the ╗ / ╣ corners of the border rows. The previous
    ``_W - 2`` padding plus one literal space gave only ``_W - 1`` columns,
    leaving the title row one column narrower than the borders.

    Args:
        title: Text shown in the title bar; one pad space is inserted before it.

    Returns:
        A string containing the three rows joined by newlines (no trailing newline).
    """
    bar = _rule("═")
    inner = f" {title}"  # single pad column between the left border and the title
    return f"╔{bar}╗\n║{inner:<{_W}}║\n╠{bar}╣"
def _divider() -> str:
    """Return the full-width separator row drawn between two box sections."""
    return "╠" + _rule("═") + "╣"
def _footer() -> str:
    """Return the bottom border row that closes a box."""
    return "╚" + _rule("═") + "╝"
def _row(text: str = "") -> str:
    """Return one content row: ``text`` left-aligned and space-padded to ``_W``
    columns between two ║ borders. Text longer than ``_W`` is not truncated."""
    padded = text.ljust(_W)
    return "║" + padded + "║"
def _body(lines: list[str]) -> str:
    """Render every entry of ``lines`` as a box row and join them with newlines."""
    return "\n".join(map(_row, lines))
def _print_box(title: str, sections: list[list[str]]) -> None:
    """Print a complete box: title bar, each section's rows, and the footer.

    A divider row is printed before every section except the first.
    """
    print(_header(title))
    first = True
    for section in sections:
        if not first:
            print(_divider())
        first = False
        print(_body(section))
    print(_footer())
def _tier_bar(
eligible_len: int,
matched_len: int,
cache_min: int = 1024,
cache_inc: int = 128,
) -> str:
"""Compact tier bar: ▓ = covered tier, ░ = reachable but not yet crossed."""
if matched_len < cache_min:
return "n/a (below minimum threshold)"
max_tiers = (matched_len - cache_min) // cache_inc + 1
hit_tiers = (eligible_len - cache_min) // cache_inc + 1
bar = "▓" * hit_tiers + "░" * (max_tiers - hit_tiers)
next_boundary = cache_min + hit_tiers * cache_inc
tokens_to_next = next_boundary - matched_len
suffix = f" (+{tokens_to_next} to tier {hit_tiers + 1})" if tokens_to_next > 0 else ""
return f"[{bar}] {hit_tiers}/{max_tiers}{suffix}"
def _result_rows(
result: TokenDiffResult,
ref_len: int,
cand_len: int,
cache_min: int = 1024,
cache_inc: int = 128,
) -> list[str]:
"""Build the content rows for a comparison result section inside a box."""
rows: list[str] = []
delta = cand_len - ref_len
sign = "+" if delta >= 0 else ""
rows.append(f" Reference length : {ref_len:,} tokens")
rows.append(f" Candidate length : {cand_len:,} tokens ({sign}{delta:,})")
rows.append("")
icon = {
"identical": "≡", "extension": "→", "truncation": "←", "mutation": "✗"
}.get(result.divergence_type, "?")
rows.append(f" Divergence type : {icon} {result.divergence_type}")
rows.append(f" Prefix matched : {result.matching_prefix_len:,} tokens (raw)")
if result.divergence_index is not None:
rt, ct = result.divergent_tokens # type: ignore[misc]
rows.append(
f" First break : index {result.divergence_index:,}"
f" [ref={rt} cand={ct}]"
)
rows.append("")
if result.cache_eligible_len:
rows.append(f" Cache-eligible : {result.cache_eligible_len:,} tokens")
rows.append(
" Tier bar : "
+ _tier_bar(result.cache_eligible_len, result.matching_prefix_len,
cache_min, cache_inc)
)
rows.append("")
rows.append(f" ✓ Valid cache prefix — {result.cache_tiers_hit} tier(s) covered")
else:
rows.append(f" Cache-eligible : 0 (need >= {cache_min:,} matching tokens)")
short_by = cache_min - result.matching_prefix_len
rows.append(
f" Raw prefix only : {result.matching_prefix_len:,} tokens"
+ (f" (short by {short_by:,})" if short_by > 0 else "")
)
rows.append("")
rows.append(" ✗ No cache discount — prefix too short or mutated")
return rows
# ─────────────────────────────────────────────────────────────────────────────
# Demo — simulated multi-turn chat context with caching diagnostics
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Demo driver: builds synthetic token sequences for a growing multi-turn
    # conversation, diffs each turn against the previous one, and prints the
    # results as boxes. Every number shown in a title or caption is derived
    # from the constants below, so changing a constant cannot leave stale text.
    SEED = 42
    CACHE_MIN = 1024
    CACHE_INC = 128
    VOCAB_MAX = 50_256  # upper bound for the synthetic token IDs
    random.seed(SEED)

    # ── Build simulated token sequences ──────────────────────────────────────
    # Token IDs are arbitrary random integers in [1, VOCAB_MAX]; they only
    # need to be comparable, not decodable by any real tokeniser.
    BASE_LEN = 1_500    # system prompt + previous conversation context
    ROUND1_LEN = 210    # first new user message (Turn 1)
    ROUND2_LEN = 195    # second new user message (Turn 2)
    base = [random.randint(1, VOCAB_MAX) for _ in range(BASE_LEN)]
    round1 = [random.randint(1, VOCAB_MAX) for _ in range(ROUND1_LEN)]
    round2 = [random.randint(1, VOCAB_MAX) for _ in range(ROUND2_LEN)]
    seq_r0 = base                     # initial cached context
    seq_r1 = base + round1            # after Round 1
    seq_r2 = base + round1 + round2   # after Round 2

    # Branch: silently mutate one token inside the base context, then
    # re-append the same round1 and round2 suffixes. Total length unchanged.
    MUTATION_IDX = 47
    mutated_base = base[:]
    # Adding 999 modulo VOCAB_MAX can never map a value in [1, VOCAB_MAX]
    # back onto itself, so the token is guaranteed to change.
    mutated_base[MUTATION_IDX] = (mutated_base[MUTATION_IDX] + 999) % VOCAB_MAX
    seq_branch = mutated_base + round1 + round2

    # First four tier boundaries, e.g. "1024 -> 1152 -> 1280 -> 1408".
    tier_preview = " -> ".join(str(CACHE_MIN + k * CACHE_INC) for k in range(4))

    # ── Intro ─────────────────────────────────────────────────────────────────
    print()
    _print_box(
        " TOKEN CACHE PREFIX DIFF — MULTI-TURN DEMO",
        [[
            " Simulated tiktoken integer sequences (no real model call needed).",
            " Each turn compares the previous full prompt against the new one,",
            " mirroring how you would call compare_token_sequences() in practice.",
            "",
            f" Cache discount rule: prefix >= {CACHE_MIN:,} tokens, aligned to",
            f" {CACHE_INC}-token tiers ({tier_preview} -> ...)",
            "",
            " Tier bar key: ▓ = cache tier covered ░ = tier within reach",
        ]],
    )
    print()

    # ── Turn 0: base context seeded ───────────────────────────────────────────
    _print_box(
        f" TURN 0 · Base Context Seeded (seed={SEED})",
        [[
            f" {BASE_LEN:,} tokens generated — system prompt + prior assistant turns",
            " already present in the context window.",
            "",
            " Stored as the cache reference. No comparison yet.",
        ]],
    )
    print()

    # ── Turn 1: first user round ──────────────────────────────────────────────
    r1 = compare_token_sequences(seq_r0, seq_r1, CACHE_MIN, CACHE_INC)
    _print_box(
        f" TURN 1 · Round 1 User Input (+{ROUND1_LEN} tokens appended)",
        [
            [
                f" {ROUND1_LEN} new user-message tokens appended to the base context.",
                " ref = stored cache (Turn 0) cand = new full prompt",
            ],
            _result_rows(r1, len(seq_r0), len(seq_r1), CACHE_MIN, CACHE_INC),
        ],
    )
    print()

    # ── Turn 2: second user round ─────────────────────────────────────────────
    r2 = compare_token_sequences(seq_r1, seq_r2, CACHE_MIN, CACHE_INC)
    tier_delta = r2.cache_tiers_hit - r1.cache_tiers_hit
    if tier_delta > 0:
        tier_note = (
            f" ↑ +{tier_delta} tier(s) vs Turn 1 — grew past"
            f" {tier_delta} x {CACHE_INC}-token boundary(s)."
        )
    else:
        tier_note = " — No new cache tier boundary crossed since Turn 1."
    _print_box(
        f" TURN 2 · Round 2 User Input (+{ROUND2_LEN} tokens appended)",
        [
            [
                f" {ROUND2_LEN} more tokens appended. Context keeps growing cleanly.",
                " ref = Turn 1 full prompt cand = Turn 2 full prompt",
            ],
            _result_rows(r2, len(seq_r1), len(seq_r2), CACHE_MIN, CACHE_INC),
            [tier_note],
        ],
    )
    print()

    # ── Turn 3: branch / mutation ─────────────────────────────────────────────
    r3 = compare_token_sequences(seq_r2, seq_branch, CACHE_MIN, CACHE_INC)
    _print_box(
        " TURN 3 · Branch — Early Mutation Detected",
        [
            [
                f" Token at index {MUTATION_IDX} was silently changed inside the base context.",
                f" Total length is unchanged ({len(seq_branch):,} tokens) — mutation is subtle.",
                " ref = Turn 2 full prompt cand = mutated branch",
            ],
            _result_rows(r3, len(seq_r2), len(seq_branch), CACHE_MIN, CACHE_INC),
            [
                f" ⚠ Prefix breaks at index {r3.divergence_index}."
                f" All {r2.cache_tiers_hit} previously earned tier(s) wiped.",
                " Server must recompute the KV-cache from scratch.",
                "",
                " Common causes of early mutation:",
                " · Timestamp / request-ID injected into system prompt",
                " · Dynamic fields (username, locale) placed before static content",
                " · Tool-call results inserted ahead of the stable context block",
            ],
        ],
    )
    print()
Discussion in the ATmosphere