06 — Semantic Cache: Similarity, TTL, and Invalidation¶
Problem: Exact-match caches miss paraphrases. Pure semantic caches return stale answers after the knowledge base changes.
In this notebook: Tiny in-memory cache keyed by embedding similarity plus a KB-version tag; show the hit/miss flip when the KB version bumps. (TTL works the same way: treat entries past their expiry like entries from a stale version.)
In [ ]:
import time  # NOTE(review): unused here — presumably intended for TTL timestamps; confirm or remove
import numpy as np
from sentence_transformers import SentenceTransformer, util
# Small, fast sentence-embedding model; loaded once at module level and shared by the cache.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
class SemanticCache:
    """In-memory semantic cache keyed by embedding similarity plus a KB version tag.

    Each entry records the knowledge-base version it was written under.
    ``get`` only considers entries whose version matches the caller's, so
    bumping the KB version invalidates older entries without any explicit
    deletion (call :meth:`evict_stale` to reclaim the memory).
    """

    def __init__(self, threshold: float = 0.82):
        # Minimum cosine similarity for a cached answer to count as a hit.
        self.threshold = threshold
        # (query, answer, kb_version, embedding) tuples in insertion order.
        self.entries: list[tuple[str, str, int, np.ndarray]] = []

    @staticmethod
    def _embed(text: str):
        """Encode *text* with the module-level model (shared by get/set)."""
        return model.encode(text, convert_to_tensor=True, show_progress_bar=False)

    def get(self, q: str, kb_version: int):
        """Look up *q* among entries written under *kb_version*.

        Returns ("HIT", answer, sim) when the best same-version entry meets
        the threshold, otherwise ("MISS", None, best_sim). best_sim is -1.0
        when no same-version entry exists (cosine similarity is >= -1).
        """
        qe = self._embed(q)
        best_sim, best_ans = -1.0, None
        for _, ans, ver, emb in self.entries:
            if ver != kb_version:
                continue  # stale: written under a different KB version
            sim = float(util.cos_sim(qe, emb))
            if sim > best_sim:
                best_sim, best_ans = sim, ans
        if best_sim >= self.threshold:
            return "HIT", best_ans, best_sim
        return "MISS", None, best_sim

    def set(self, q: str, answer: str, kb_version: int):
        """Store *answer* for query *q* under the given KB version."""
        self.entries.append((q, answer, kb_version, self._embed(q)))

    def evict_stale(self, kb_version: int) -> int:
        """Drop entries whose version differs from *kb_version*; return count removed.

        Stale entries can never be hits, but until evicted they still cost
        one cosine-similarity comparison's worth of scanning per ``get``.
        """
        before = len(self.entries)
        self.entries = [e for e in self.entries if e[2] == kb_version]
        return before - len(self.entries)
# Demo: seed one v1 entry, probe with a paraphrase and an exact repeat,
# then bump the KB version to show that v1 entries stop matching.
cache = SemanticCache(threshold=0.85)
cache.set("What is the refund window?", "30 days from invoice.", kb_version=1)

probes = ("How long do refunds take?", "What is the refund window?")
for query in probes:
    result, answer, score = cache.get(query, kb_version=1)
    print(repr(query), "->", result, "sim=", round(score, 3), "ans=", answer)

print("\nAfter KB bump to v2 (invalidates v1-only entries):")
result, answer, score = cache.get("What is the refund window?", kb_version=2)
print(result, answer, score)
Takeaways
- Pair semantic similarity with index/content version or ETag to avoid stale hits.
- Log false hits (user thumbs-down after cache hit) to tune thresholds.
- Consider two-tier: exact normalized key first, semantic second.