01 — Not All Vectors Are Equal: Embedding Choice
You choose more than a model name. Before any vector goes into an index, you decide how text is split and which encoder turns those pieces into embeddings. This notebook walks through both, step by step, with runnable code.
Run the code locally: clone the repo, install the dependencies in requirements.txt, and open this .ipynb in Jupyter from the repo root.
Source (edit / run): 01_not_all_vectors_are_equal_embedding_choice.ipynb on GitHub
Step 0 — What you are choosing
| Layer | What it controls | Examples |
|---|---|---|
| Chunking | What one “document” in the index represents | Fixed window, paragraph/semantic splits, hierarchical sections |
| Embedding model | How meaning is compressed into a vector | all-MiniLM-L6-v2 vs all-mpnet-base-v2, multilingual, domain-tuned |
| Downstream | Quality vs cost | Index size, query latency, recall on paraphrases |
Bad chunking + a great model still returns the wrong span; a great chunk + a weak model can miss paraphrases. You tune both on your data.
Step 1 — Fixed-size (sliding window) chunking
Idea: Cut text every N characters (or tokens), often with overlap so boundaries do not swallow answers.
Pros: Simple, predictable chunk count, easy to implement.
Cons: Splits mid-sentence or mid-policy; retrieval can return a fragment that is “similar” but missing the line that answers the question.
Run the cell below on one synthetic policy blob and inspect where the cuts land.
import sys
from pathlib import Path

_REPO = Path.cwd().resolve()
if (_REPO / "src").is_dir():
    sys.path.insert(0, str(_REPO / "src"))

from rag_series_utils import chunk_fixed_size

policy_doc = (
    "## Refund policy\n"
    "Enterprise customers may request a full refund within 30 days of the invoice date.\n"
    "The billing dispute code is POL-ENT-7721 — include it in tickets.\n"
    "\n"
    "## API rate limits\n"
    "Standard tier allows 100 requests per minute per API key."
)

flat = policy_doc.replace("\n", " ")
fixed = chunk_fixed_size(flat, chunk_size=120, overlap=20)

print(f"Fixed-size chunks ({len(fixed)} total):\n")
for i, c in enumerate(fixed, 1):
    print(f"--- chunk {i} ({len(c)} chars) ---")
    print(c)
    print()
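chunk_fixed_size lives in src/rag_series_utils.py; if you only want the idea, a minimal sketch of a character window with overlap looks like this (the repo helper's actual signature and edge-case handling may differ):

```python
def fixed_size_chunks(text: str, chunk_size: int = 120, overlap: int = 20) -> list[str]:
    """Slide a chunk_size-character window, stepping chunk_size - overlap each time."""
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= len(text):
            break  # the last window already reached the end of the text
    return chunks

demo = "".join(str(i % 10) for i in range(250))
print([len(c) for c in fixed_size_chunks(demo)])  # [120, 120, 50]
```

Note the overlap at work: each chunk's last 20 characters reappear as the next chunk's first 20, so an answer sitting on a boundary survives in at least one chunk.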
Step 2 — Semantic chunking (paragraph / blank-line boundaries)
Idea: Split where the author already broke ideas—paragraphs, double newlines, or sentence boundaries—not at a fixed character count.
Pros: Keeps related sentences together; often better for Q&A when answers sit in one paragraph.
Cons: Very long paragraphs still need a max length; tables and lists need special rules.
Below we reuse the same policy text with newlines preserved and merge paragraphs up to a character budget.
import sys
from pathlib import Path

_REPO = Path.cwd().resolve()
if (_REPO / "src").is_dir():
    sys.path.insert(0, str(_REPO / "src"))

from rag_series_utils import chunk_by_paragraphs

policy_doc = (
    "## Refund policy\n"
    "Enterprise customers may request a full refund within 30 days of the invoice date.\n"
    "The billing dispute code is POL-ENT-7721 — include it in tickets.\n"
    "\n"
    "## API rate limits\n"
    "Standard tier allows 100 requests per minute per API key."
)

semantic = chunk_by_paragraphs(policy_doc, max_chars=400)

print(f"Semantic (paragraph) chunks ({len(semantic)} total):\n")
for i, c in enumerate(semantic, 1):
    print(f"--- chunk {i} ---")
    print(c)
    print()
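For intuition, here is one way the paragraph merge can be implemented: split on blank lines, then greedily pack paragraphs up to the character budget. This is a sketch of the approach, not the repo's actual chunk_by_paragraphs code:

```python
def paragraph_chunks(text: str, max_chars: int = 400) -> list[str]:
    """Split on blank lines, then greedily merge paragraphs up to max_chars."""
    paras = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks: list[str] = []
    current = ""
    for p in paras:
        if current and len(current) + len(p) + 2 > max_chars:
            chunks.append(current)  # budget full: flush and start a new chunk
            current = p
        else:
            current = f"{current}\n\n{p}" if current else p
    if current:
        chunks.append(current)
    return chunks

print(paragraph_chunks("aaa\n\nbbb\n\nccc", max_chars=8))  # ['aaa\n\nbbb', 'ccc']
```

An oversized single paragraph still becomes its own chunk here, which is why a fallback max length (or a secondary sentence split) matters in practice.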
Step 3 — Hierarchical chunking (structure / headings)
Idea: One chunk per logical section—e.g. everything under ## Refund policy until the next ## heading. Mirrors how humans skim docs.
Pros: Retrieval returns a whole section; good for intranets and manuals with a clear outline.
Cons: Needs detectable structure (headings, HTML tags, or a TOC); flat PDFs need layout parsing first.
We split on markdown ## / ### lines (see chunk_by_headings in src/rag_series_utils.py).
import sys
from pathlib import Path

_REPO = Path.cwd().resolve()
if (_REPO / "src").is_dir():
    sys.path.insert(0, str(_REPO / "src"))

from rag_series_utils import chunk_by_headings

policy_doc = (
    "## Refund policy\n"
    "Enterprise customers may request a full refund within 30 days of the invoice date.\n"
    "The billing dispute code is POL-ENT-7721 — include it in tickets.\n"
    "\n"
    "## API rate limits\n"
    "Standard tier allows 100 requests per minute per API key."
)

hier = chunk_by_headings(policy_doc)

print(f"Hierarchical chunks ({len(hier)} total):\n")
for i, c in enumerate(hier, 1):
    print(f"--- chunk {i} ---")
    print(c)
    print()
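A heading splitter can be approximated in a few lines of regex. A sketch (the repo's chunk_by_headings may handle more heading levels and edge cases):

```python
import re

def heading_chunks(text: str) -> list[str]:
    """One chunk per markdown section: a ##/### line plus everything until the next heading."""
    chunks: list[list[str]] = []
    for line in text.splitlines():
        if re.match(r"^#{2,3}\s", line) or not chunks:
            chunks.append([])  # a new heading (or leading preamble) opens a new chunk
        chunks[-1].append(line)
    return ["\n".join(c).strip() for c in chunks]

doc = "## A\nalpha\n\n## B\nbeta"
print(heading_chunks(doc))  # ['## A\nalpha', '## B\nbeta']
```

Text before the first heading becomes its own preamble chunk rather than being dropped, which is usually what you want for title blocks and abstracts.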
More choices (short list)
These are not separate notebooks here, but they belong in the same design conversation:
- Token-aware windows — chunk by tokenizer token budget (e.g. 256–512 tokens) instead of raw characters so you align with the embedding model’s training.
- Late interaction / ColBERT-style — store token vectors instead of one vector per chunk; more accurate, heavier index.
- Multilingual vs English-only — match the languages in your corpus and queries.
- Domain / fine-tuned encoders — legal, medical, or support-ticket models when generic sentence embeddings plateau.
- Sparse + dense — BM25 keywords alongside vectors (covered in a later part of this series).
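The first bullet is easy to prototype. Below, a plain whitespace split stands in for a real tokenizer; in practice you would count tokens with the embedding model's own tokenizer (e.g. loaded via Hugging Face's AutoTokenizer), so the window matches what the encoder actually sees:

```python
def token_window_chunks(text: str, max_tokens: int = 256, overlap_tokens: int = 32) -> list[str]:
    """Window over a token list instead of raw characters."""
    tokens = text.split()  # stand-in: swap in a real tokenizer's output here
    step = max_tokens - overlap_tokens
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        if start + max_tokens >= len(tokens):
            break
    return chunks

demo = " ".join(f"w{i}" for i in range(500))
print([len(c.split()) for c in token_window_chunks(demo)])  # [256, 256, 52]
```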
Next: hold chunking fixed and compare two dense embedding models on the same list of chunks.
Step 4 — Embedding model choice (same chunks, two encoders)
We use five short “chunks” (already one sentence each) and one paraphrased query. Compare all-MiniLM-L6-v2 (small, fast) vs all-mpnet-base-v2 (larger, often better semantics). Watch top-1 retrieval and encode time.
import time

import numpy as np
from sentence_transformers import SentenceTransformer, util
from tabulate import tabulate

docs = [
    "Refund policy: enterprise customers may request a refund within 30 days of invoice.",
    "API rate limits: standard tier allows 100 requests per minute per API key.",
    "Security: rotate API keys every 90 days and store them in a secrets manager.",
    "Billing: usage is metered monthly; overages are charged at the published rate card.",
    "Support SLAs: priority incidents receive first response within one business hour.",
]
query = "How long do I have to get my money back after purchase?"

models = {
    "fast_small": "sentence-transformers/all-MiniLM-L6-v2",
    "slower_larger": "sentence-transformers/all-mpnet-base-v2",
}

results = []
for label, name in models.items():
    t0 = time.perf_counter()
    model = SentenceTransformer(name)  # downloads weights on first run
    load_s = time.perf_counter() - t0

    t1 = time.perf_counter()
    doc_emb = model.encode(docs, convert_to_tensor=True, show_progress_bar=False)
    q_emb = model.encode(query, convert_to_tensor=True, show_progress_bar=False)
    enc_s = time.perf_counter() - t1

    sims = util.cos_sim(q_emb, doc_emb)[0]
    top_i = int(np.argmax(sims.cpu().numpy()))
    results.append(
        {
            "label": label,
            "model": name,
            "dim": doc_emb.shape[1],
            "load_s": round(load_s, 2),
            "encode_s": round(enc_s, 4),
            "top_i": top_i,
            "top_score": float(sims[top_i]),
            "top_doc": docs[top_i][:80] + "...",
        }
    )

print(tabulate([{k: v for k, v in r.items() if k != "top_doc"} for r in results], headers="keys"))
print()
for r in results:
    print(r["label"], "->", r["top_doc"])
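util.cos_sim is nothing exotic: a normalized dot product. A numpy sketch of the math the comparison above relies on (the random vectors here are illustrative stand-ins, not real embeddings):

```python
import numpy as np

def cosine_scores(query: np.ndarray, doc_matrix: np.ndarray) -> np.ndarray:
    """Cosine similarity of one query vector against each row of doc_matrix."""
    q = query / np.linalg.norm(query)
    d = doc_matrix / np.linalg.norm(doc_matrix, axis=1, keepdims=True)
    return d @ q

rng = np.random.default_rng(0)
doc_vecs = rng.normal(size=(5, 8))
query_vec = doc_vecs[2] + 0.01 * rng.normal(size=8)  # near-duplicate of doc 2
print(int(np.argmax(cosine_scores(query_vec, doc_vecs))))  # 2
```

Because both sides are length-normalized, the score depends only on direction; that is why embeddings of a paraphrase can score high even when no words overlap.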
Takeaways
- Chunking — Fixed window, semantic (paragraph), and hierarchical (headings) are three standard levers; pick based on document shape and where answers live.
- Embedding model — Smaller is faster; larger often handles paraphrase and nuance better. Benchmark on your own queries, not only on MTEB scores.
- Ship both — Log chunk boundaries and embedding model version when you debug a bad retrieval in production.
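The last takeaway is cheap to implement: attach provenance to every chunk at index time so a bad retrieval can be traced back to a chunking decision or a model upgrade. A minimal record shape (field names here are illustrative, not a repo API):

```python
from dataclasses import dataclass, asdict

@dataclass(frozen=True)
class ChunkRecord:
    doc_id: str
    chunk_index: int
    start_char: int       # chunk boundaries in the source document
    end_char: int
    embed_model: str      # which encoder produced this chunk's vector
    text: str

rec = ChunkRecord(
    doc_id="policy.md",
    chunk_index=0,
    start_char=0,
    end_char=120,
    embed_model="sentence-transformers/all-MiniLM-L6-v2",
    text="## Refund policy ...",
)
print(asdict(rec)["embed_model"])
```

Store this alongside the vector (most vector stores accept a metadata dict) and a production debugging session starts from facts instead of guesses.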
Again — runnable source: notebook on GitHub