Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ uv add semble # Or install with uv

To update Semble, see [Updating](#updating).

Curious how many tokens Semble has saved you? Run `semble savings` to see. See [Savings](#savings) for details.

## Main Features

- **Fast**: indexes an average repo in ~250 ms and answers queries in ~1.5 ms, all on CPU.
Expand Down Expand Up @@ -185,6 +187,29 @@ semble find-related src/auth.py 42 ./my-project

If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.

### Savings

`semble savings` shows how many tokens Semble has saved across all your searches:

```bash
semble savings # summary by period
semble savings --verbose # also show breakdown by call type
```

```
Semble Token Savings
════════════════════════════════════════════════════════════════
Period Calls Savings
────────────────────────────────────────────────────────────────
Today 42 [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘] ~58.4k tokens (95%)
Last 7 days 287 [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘] ~312.4k tokens (90%)
All time 1.4k [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘] ~1.2M tokens (89%)
```

**How savings are calculated:** for each call, semble records the total character count of the unique files containing returned chunks and the character count of the snippets returned. Estimated tokens saved is `(file chars βˆ’ snippet chars) / 4` (4 chars per token). This is a conservative estimate: the baseline is reading matched files in full, which is how coding agents often explore unfamiliar code.

Stats are stored in `~/.semble/savings.jsonl`.

### Updating

To update/upgrade Semble to the latest version:
Expand Down
10 changes: 9 additions & 1 deletion src/semble/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
from model2vec.utils import get_package_extras

from semble.index import SembleIndex
from semble.stats import format_savings_report
from semble.utils import _format_results, _is_git_url, _resolve_chunk

_CLAUDE_FILE_PATH = Path(".claude") / "agents" / "semble-search.md"
_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "-h", "--help"})
_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help"})


def main() -> None:
Expand Down Expand Up @@ -91,12 +92,19 @@ def _cli_main() -> None:
init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for Claude Code sub-agent support.")
init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.")

savings_p = sub.add_parser("savings", help="Show token savings and usage stats.")
savings_p.add_argument("--verbose", action="store_true", help="Also show usage breakdown by call type.")

args = parser.parse_args()

if args.command == "init":
_run_init(force=args.force)
return

if args.command == "savings":
print(format_savings_report(verbose=args.verbose), end="")
return

include_text = args.include_text_files
index = (
SembleIndex.from_git(args.path, include_text_files=include_text)
Expand Down
48 changes: 33 additions & 15 deletions src/semble/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from semble.index.create import create_index_from_path
from semble.index.dense import SelectableBasicBackend, load_model
from semble.search import search_bm25, search_hybrid, search_semantic
from semble.types import Chunk, Encoder, IndexStats, SearchMode, SearchResult
from semble.stats import save_search_stats
from semble.types import CallType, Chunk, Encoder, IndexStats, SearchMode, SearchResult

_GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))

Expand All @@ -27,18 +28,22 @@ def __init__(
bm25_index: BM25,
semantic_index: SelectableBasicBackend,
chunks: list[Chunk],
root: Path | None = None,
) -> None:
"""Internal constructor β€” use :meth:`from_path` or :meth:`from_git`.
"""Initialize a SembleIndex. Should be created with from_path or from_git.

:param model: Embedding model to use.
:param bm25_index: The bm25 index.
:param semantic_index: The semantic index.
:param chunks: The found chunks.
:param root: Root directory used to read file sizes for token-savings stats.
"""
self.model: Encoder = model
self.chunks: list[Chunk] = chunks
self._bm25_index: BM25 = bm25_index
self._semantic_index: SelectableBasicBackend = semantic_index
self._root: Path | None = root
self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {}
self._file_mapping, self._language_mapping = self._populate_mapping()

def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]:
Expand All @@ -53,6 +58,18 @@ def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]

return dict(file_to_id), dict(language_to_id)

def _compute_file_sizes(self, root: Path) -> dict[str, int]:
    """Return a mapping of repo-relative file path to total character count.

    Files that cannot be read (deleted, permission-denied, etc.) are simply
    omitted from the mapping.
    """
    sizes: dict[str, int] = {}
    for file_path in {chunk.file_path for chunk in self.chunks}:
        try:
            text = (root / file_path).read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        sizes[file_path] = len(text)
    return sizes

@property
def stats(self) -> IndexStats:
"""Stats of an index."""
Expand Down Expand Up @@ -103,9 +120,7 @@ def from_path(
display_root=path,
)

index = SembleIndex(model, bm25, vicinity, chunks)

return index
return SembleIndex(model, bm25, vicinity, chunks, root=path)

@classmethod
def from_git(
Expand Down Expand Up @@ -157,9 +172,7 @@ def from_git(
display_root=resolved_path,
)

index = SembleIndex(model, bm25, vicinity, chunks)

return index
return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path)

def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]:
"""Return chunks semantically similar to the given chunk or search result.
Expand All @@ -171,7 +184,9 @@ def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[
target = source.chunk if isinstance(source, SearchResult) else source
selector = self._get_selector_vector(filter_languages=[target.language]) if target.language else None
results = search_semantic(target.content, self.model, self._semantic_index, self.chunks, top_k + 1, selector)
return [r for r in results if r.chunk != target][:top_k]
results = [r for r in results if r.chunk != target][:top_k]
save_search_stats(results, CallType.FIND_RELATED, self._file_sizes)
return results

def _get_selector_vector(
self, filter_languages: list[str] | None = None, filter_paths: list[str] | None = None
Expand Down Expand Up @@ -216,11 +231,14 @@ def search(
selector = self._get_selector_vector(filter_languages, filter_paths)

if mode == SearchMode.BM25:
return search_bm25(query, bm25_index, self.chunks, top_k, selector=selector)
if mode == SearchMode.SEMANTIC:
return search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector)
if mode == SearchMode.HYBRID:
return search_hybrid(
results = search_bm25(query, bm25_index, self.chunks, top_k, selector=selector)
elif mode == SearchMode.SEMANTIC:
results = search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector)
elif mode == SearchMode.HYBRID:
results = search_hybrid(
query, self.model, semantic_index, bm25_index, self.chunks, top_k, alpha=alpha, selector=selector
)
raise ValueError(f"Unknown search mode: {mode!r}")
else:
raise ValueError(f"Unknown search mode: {mode!r}")
save_search_stats(results, CallType.SEARCH, self._file_sizes)
return results
140 changes: 140 additions & 0 deletions src/semble/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import json
import logging
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path

from semble.types import CallType, SearchResult

logger = logging.getLogger(__name__)

_STATS_FILE = Path.home() / ".semble" / "savings.jsonl"


@dataclass
class BucketStats:
    """Running totals for one reporting period (e.g. "Today")."""

    # Number of recorded search/find_related calls.
    calls: int = 0
    # Total characters of the snippets returned to the caller.
    snippet_chars: int = 0
    # Total characters of the unique files those snippets came from.
    file_chars: int = 0

    def add(self, snippet_chars: int, file_chars: int) -> None:
        """Record one call together with its snippet and file character counts."""
        self.calls = self.calls + 1
        self.snippet_chars = self.snippet_chars + snippet_chars
        self.file_chars = self.file_chars + file_chars


@dataclass
class SavingsSummary:
    """Aggregated savings stats read back from the stats file."""

    # Period label ("Today", "Last 7 days", "All time") -> accumulated stats.
    buckets: dict[str, BucketStats]
    # Call-type string -> number of recorded calls of that type.
    call_type_counts: dict[str, int]


def save_search_stats(
    results: list[SearchResult],
    call_type: CallType,
    file_sizes: dict[str, int],
) -> None:
    """Append one stats record for a search or find_related call to the stats file.

    Best-effort: any filesystem error is swallowed (and logged at debug level)
    so that stats collection can never break a search.

    :param results: The results that were returned to the caller.
    :param call_type: Which API produced the results.
    :param file_sizes: Mapping of repo-relative file path to character count.
    """
    try:
        snippet_chars = sum(len(result.chunk.content) for result in results)
        # Count each matched file once, even when several chunks share a file.
        file_chars = sum(
            file_sizes[path] for path in {result.chunk.file_path for result in results} if path in file_sizes
        )

        record = {
            "ts": datetime.now(timezone.utc).timestamp(),
            "call": call_type.value,  # store the plain string, not the enum member
            "results": len(results),
            "snippet_chars": snippet_chars,
            "file_chars": file_chars,
        }
        _STATS_FILE.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding: the default text encoding is platform-dependent.
        with _STATS_FILE.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")
    except OSError:
        logger.debug("Could not write savings stats", exc_info=True)


def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary:
    """Read a savings JSONL file and aggregate it into a SavingsSummary.

    Malformed lines — invalid JSON, or records missing an expected field —
    are skipped with a warning instead of aborting the whole report.

    :param path: Stats file to read; defaults to the global stats file.
    :return: Per-period buckets plus per-call-type counts.
    """
    now = datetime.now(timezone.utc)
    today = now.date()
    seven_days_ago = (now - timedelta(days=7)).date()

    buckets = {
        "Today": BucketStats(),
        "Last 7 days": BucketStats(),
        "All time": BucketStats(),
    }
    call_type_counts: defaultdict[str, int] = defaultdict(int)

    with path.open(encoding="utf-8") as f:
        for line in f:
            try:
                record = json.loads(line)
                snippet_chars = record["snippet_chars"]
                file_chars = record["file_chars"]
                call_type = record["call"]
                ts = record["ts"]
            # KeyError/TypeError: valid JSON but not a well-formed record.
            except (json.JSONDecodeError, KeyError, TypeError):
                logger.warning("Skipping malformed line in stats file")
                continue
            call_type_counts[call_type] += 1
            record_date = datetime.fromtimestamp(ts, tz=timezone.utc).date()
            buckets["All time"].add(snippet_chars, file_chars)
            if record_date > seven_days_ago:
                buckets["Last 7 days"].add(snippet_chars, file_chars)
            if record_date == today:
                buckets["Today"].add(snippet_chars, file_chars)

    return SavingsSummary(buckets=buckets, call_type_counts=dict(call_type_counts))


def format_savings_report(path: Path | None = None, *, verbose: bool = False) -> str:
"""Return a formatted token-savings report."""
if path is None:
path = _STATS_FILE
if not path.exists():
return "No stats yet. Run a search first."

summary = build_savings_summary(path)
bar_width = 16
heavy_line = " " + "═" * 64
light_line = " " + "─" * 64

lines = [
"",
" Semble Token Savings",
heavy_line,
f" {'Period':<12} {'Calls':<6} Savings",
light_line,
]
for label, bucket in summary.buckets.items():
saved_chars = max(0, bucket.file_chars - bucket.snippet_chars)
saved_tokens = saved_chars // 4 # standard ~4 chars/token approximation
Comment thread
Pringled marked this conversation as resolved.
if saved_tokens >= 1_000_000:
saved_str = f"~{saved_tokens / 1_000_000:.1f}M"
elif saved_tokens >= 1000:
saved_str = f"~{saved_tokens / 1000:.1f}k"
else:
saved_str = f"~{saved_tokens}"
calls_str = f"{bucket.calls / 1000:.1f}k" if bucket.calls >= 1000 else str(bucket.calls)
if bucket.file_chars > 0:
ratio = saved_chars / bucket.file_chars
filled = round(ratio * bar_width)
bar = "β–ˆ" * filled + "β–‘" * (bar_width - filled)
pct = round(ratio * 100)
lines.append(f" {label:<12} {calls_str:<6} [{bar}] {saved_str} tokens ({pct}%)")
else:
lines.append(f" {label:<12} {calls_str:<6} [{'β–‘' * bar_width}] {saved_str} tokens")
if verbose and summary.call_type_counts:
lines += ["", " Usage Breakdown", light_line, f" {'Call type':<16} Calls"]
for call_type, count in sorted(summary.call_type_counts.items()):
count_str = f"{count / 1000:.1f}k" if count >= 1000 else str(count)
lines.append(f" {call_type:<16} {count_str}")
lines.append(heavy_line)
lines.append("")
return "\n".join(lines)
7 changes: 7 additions & 0 deletions src/semble/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ class SearchMode(str, Enum):
BM25 = "bm25"


class CallType(str, Enum):
    """Call type for token-savings tracking.

    The str mixin lets members be used directly where a plain string
    is expected (e.g. when recorded in the savings stats file).
    """

    # A regular index search() call.
    SEARCH = "search"
    # A find_related() call.
    FIND_RELATED = "find_related"


class Encoder(Protocol):
"""Protocol for embedding models."""

Expand Down
20 changes: 20 additions & 0 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from semble import SembleIndex
from semble.index.create import _MAX_FILE_BYTES, create_index_from_path
from semble.types import Encoder
from tests.conftest import make_chunk


@pytest.fixture
Expand Down Expand Up @@ -87,6 +88,25 @@ def test_search_empty_query_returns_empty(indexed_index: SembleIndex, mode: str,
assert indexed_index.search(query, mode=mode) == []


@pytest.mark.parametrize(
    ("disk_files", "chunk_paths", "expected"),
    [
        ({"foo.py": "hello world"}, ["foo.py", "foo.py"], {"foo.py": 11}),
        ({}, ["nonexistent.py"], {}),
    ],
    ids=["dedup-same-file", "missing-file-skipped"],
)
def test_compute_file_sizes(
    tmp_path: Path, disk_files: dict[str, str], chunk_paths: list[str], expected: dict[str, int]
) -> None:
    """_compute_file_sizes deduplicates paths and silently skips missing files."""
    for filename, text in disk_files.items():
        (tmp_path / filename).write_text(text)
    # Bypass __init__ so no model/index machinery is needed for this unit test.
    stub = SembleIndex.__new__(SembleIndex)
    stub.chunks = [make_chunk("c", path) for path in chunk_paths]
    assert stub._compute_file_sizes(tmp_path) == expected


def test_find_related(indexed_index: SembleIndex) -> None:
"""find_related returns related chunks for a Chunk or SearchResult seed."""
chunk = indexed_index.chunks[0]
Expand Down
Loading
Loading