diff --git a/README.md b/README.md index 53f382f..b7de702 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,8 @@ uv add semble # Or install with uv To update Semble, see [Updating](#updating). +Curious how many tokens Semble has saved you? Run `semble savings` to see. See [Savings](#savings) for details. + ## Main Features - **Fast**: indexes an average repo in ~250 ms and answers queries in ~1.5 ms, all on CPU. @@ -185,6 +187,29 @@ semble find-related src/auth.py 42 ./my-project If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. +### Savings + +`semble savings` shows how many tokens semble has saved across all your searches: + +```bash +semble savings # summary by period +semble savings --verbose # also show breakdown by call type +``` + +``` + Semble Token Savings + ════════════════════════════════════════════════════════════════ + Period Calls Savings + ──────────────────────────────────────────────────────────────── + Today 42 [███████████████░] ~58.4k tokens (95%) + Last 7 days 287 [██████████████░░] ~312.4k tokens (90%) + All time 1.4k [██████████████░░] ~1.2M tokens (89%) +``` + +**How savings are calculated:** for each call, semble records the total character count of the unique files containing returned chunks and the character count of the snippets returned. Estimated tokens saved is `(file chars − snippet chars) / 4` (4 chars per token). This is a conservative estimate: the baseline is reading matched files in full, which is how coding agents often explore unfamiliar code. + +Stats are stored in `~/.semble/savings.jsonl`. 
+ ### Updating To update/upgrade Semble to the latest version: diff --git a/src/semble/cli.py b/src/semble/cli.py index e9de6de..0f97adb 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -8,10 +8,11 @@ from model2vec.utils import get_package_extras from semble.index import SembleIndex +from semble.stats import format_savings_report from semble.utils import _format_results, _is_git_url, _resolve_chunk _CLAUDE_FILE_PATH = Path(".claude") / "agents" / "semble-search.md" -_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "-h", "--help"}) +_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help"}) def main() -> None: @@ -91,12 +92,19 @@ def _cli_main() -> None: init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for Claude Code sub-agent support.") init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.") + savings_p = sub.add_parser("savings", help="Show token savings and usage stats.") + savings_p.add_argument("--verbose", action="store_true", help="Also show usage breakdown by call type.") + args = parser.parse_args() if args.command == "init": _run_init(force=args.force) return + if args.command == "savings": + print(format_savings_report(verbose=args.verbose), end="") + return + include_text = args.include_text_files index = ( SembleIndex.from_git(args.path, include_text_files=include_text) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 4c4c764..24c1f36 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -13,7 +13,8 @@ from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model from semble.search import search_bm25, search_hybrid, search_semantic -from semble.types import Chunk, Encoder, IndexStats, SearchMode, SearchResult +from semble.stats import save_search_stats +from semble.types import CallType, Chunk, Encoder, IndexStats, 
def _compute_file_sizes(self, root: Path) -> dict[str, int]:
    """Return a mapping of chunk file path to that file's total character count.

    Every distinct ``chunk.file_path`` in ``self.chunks`` is joined onto ``root`` and
    read once as UTF-8 (undecodable bytes are replaced rather than raising). Files
    that cannot be read are left out of the mapping.

    :param root: Directory the chunk file paths are resolved against.
    :return: Character count per readable file, in first-seen chunk order.
    """
    sizes: dict[str, int] = {}
    # dict.fromkeys de-duplicates while keeping first-seen order, so each file is
    # attempted exactly once -- including files that fail to read, which would
    # otherwise be retried for every chunk they contain.
    for file_path in dict.fromkeys(chunk.file_path for chunk in self.chunks):
        try:
            text = (root / file_path).read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Sizes only feed the token-savings estimate; a missing or unreadable
            # file must not prevent the index from being built.
            continue
        sizes[file_path] = len(text)
    return sizes
return index + return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: """Return chunks semantically similar to the given chunk or search result. @@ -171,7 +184,9 @@ def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[ target = source.chunk if isinstance(source, SearchResult) else source selector = self._get_selector_vector(filter_languages=[target.language]) if target.language else None results = search_semantic(target.content, self.model, self._semantic_index, self.chunks, top_k + 1, selector) - return [r for r in results if r.chunk != target][:top_k] + results = [r for r in results if r.chunk != target][:top_k] + save_search_stats(results, CallType.FIND_RELATED, self._file_sizes) + return results def _get_selector_vector( self, filter_languages: list[str] | None = None, filter_paths: list[str] | None = None @@ -216,11 +231,14 @@ def search( selector = self._get_selector_vector(filter_languages, filter_paths) if mode == SearchMode.BM25: - return search_bm25(query, bm25_index, self.chunks, top_k, selector=selector) - if mode == SearchMode.SEMANTIC: - return search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector) - if mode == SearchMode.HYBRID: - return search_hybrid( + results = search_bm25(query, bm25_index, self.chunks, top_k, selector=selector) + elif mode == SearchMode.SEMANTIC: + results = search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector) + elif mode == SearchMode.HYBRID: + results = search_hybrid( query, self.model, semantic_index, bm25_index, self.chunks, top_k, alpha=alpha, selector=selector ) - raise ValueError(f"Unknown search mode: {mode!r}") + else: + raise ValueError(f"Unknown search mode: {mode!r}") + save_search_stats(results, CallType.SEARCH, self._file_sizes) + return results diff --git a/src/semble/stats.py b/src/semble/stats.py new file 
logger = logging.getLogger(__name__)

_STATS_FILE = Path.home() / ".semble" / "savings.jsonl"


@dataclass
class BucketStats:
    """Running totals for one reporting period."""

    calls: int = 0
    snippet_chars: int = 0
    file_chars: int = 0

    def add(self, snippet_chars: int, file_chars: int) -> None:
        """Count one call and add its character counts to the totals."""
        self.calls += 1
        self.snippet_chars += snippet_chars
        self.file_chars += file_chars


@dataclass
class SavingsSummary:
    """Aggregated contents of the stats file: per-period totals and per-call-type counts."""

    buckets: dict[str, BucketStats]
    call_type_counts: dict[str, int]


def save_search_stats(
    results: list[SearchResult],
    call_type: CallType,
    file_sizes: dict[str, int],
) -> None:
    """Append one usage record for a search or find_related call to the stats file.

    Writing is best-effort: an ``OSError`` while creating the directory or appending
    the record is logged at debug level and never propagated to the caller.

    :param results: Results returned by the call.
    :param call_type: Which kind of call produced the results.
    :param file_sizes: Character count per file path; files absent from this mapping
        have no baseline and are excluded from both character totals.
    """
    # A snippet only represents a saving relative to the size of its file. Counting
    # snippets from files with no recorded size would add to snippet_chars without
    # adding to file_chars, dragging the aggregated savings down (always the case
    # when the index was built without a root and file_sizes is empty).
    measured = [result.chunk for result in results if result.chunk.file_path in file_sizes]
    snippet_chars = sum(len(chunk.content) for chunk in measured)
    # Each file is counted once, however many of its chunks were returned.
    file_chars = sum(file_sizes[path] for path in {chunk.file_path for chunk in measured})

    record = {
        "ts": datetime.now(timezone.utc).timestamp(),
        "call": call_type,
        "results": len(results),
        "snippet_chars": snippet_chars,
        "file_chars": file_chars,
    }
    line = json.dumps(record) + "\n"
    try:
        _STATS_FILE.parent.mkdir(parents=True, exist_ok=True)
        with _STATS_FILE.open("a", encoding="utf-8") as f:
            f.write(line)
    except OSError:
        # Usage tracking must never make a search fail.
        logger.debug("Could not write stats to %s", _STATS_FILE, exc_info=True)


def build_savings_summary(path: Path | None = None) -> SavingsSummary:
    """Read a stats file and aggregate it into a SavingsSummary.

    Periods are evaluated on UTC calendar dates: "Today" is the current UTC date and
    "Last 7 days" is the current UTC date plus the six before it. Blank lines are
    ignored; lines that are not valid JSON, or that lack the expected fields, are
    skipped with a warning.

    :param path: Stats file to read. Defaults to the module-level stats file,
        resolved when the function is called.
    :return: Per-period totals and per-call-type call counts.
    :raises OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    if path is None:
        # Resolved at call time so a changed _STATS_FILE is honoured.
        path = _STATS_FILE
    now = datetime.now(timezone.utc)
    today = now.date()
    seven_days_ago = (now - timedelta(days=7)).date()

    buckets = {
        "Today": BucketStats(),
        "Last 7 days": BucketStats(),
        "All time": BucketStats(),
    }
    call_type_counts: defaultdict[str, int] = defaultdict(int)

    # errors="replace" turns corrupted bytes into a line that fails JSON parsing and
    # is skipped, instead of aborting the whole report.
    with path.open(encoding="utf-8", errors="replace") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                logger.warning("Skipping malformed JSON line in stats file")
                continue
            # Parse every field before touching any counter so a bad record cannot
            # be half-counted.
            try:
                snippet_chars = int(record["snippet_chars"])
                file_chars = int(record["file_chars"])
                call_type = str(record["call"])
                day = datetime.fromtimestamp(record["ts"], tz=timezone.utc).date()
            except (KeyError, TypeError, ValueError, OverflowError, OSError):
                logger.warning("Skipping malformed record in stats file")
                continue

            call_type_counts[call_type] += 1
            buckets["All time"].add(snippet_chars, file_chars)
            if day > seven_days_ago:
                buckets["Last 7 days"].add(snippet_chars, file_chars)
            if day == today:
                buckets["Today"].add(snippet_chars, file_chars)

    return SavingsSummary(buckets=buckets, call_type_counts=dict(call_type_counts))


def _format_count(value: int) -> str:
    """Abbreviate a non-negative count with a ``k`` or ``M`` suffix (one decimal)."""
    if value >= 1_000_000:
        return f"{value / 1_000_000:.1f}M"
    if value >= 1000:
        return f"{value / 1000:.1f}k"
    return str(value)


def format_savings_report(path: Path | None = None, *, verbose: bool = False) -> str:
    """Return a formatted, newline-terminated token-savings report.

    :param path: Stats file to read. Defaults to the module-level stats file.
    :param verbose: Also include the number of calls per call type.
    :return: The report text, or a short hint when no stats file exists yet.
    """
    if path is None:
        path = _STATS_FILE
    # Open-and-handle rather than exists()-then-open: no window in which the file
    # can disappear between the check and the read.
    try:
        summary = build_savings_summary(path)
    except FileNotFoundError:
        # Newline-terminated like the full report; the CLI prints with end="".
        return "No stats yet. Run a search first.\n"

    bar_width = 16
    heavy_line = " " + "═" * 64
    light_line = " " + "─" * 64

    lines = [
        "",
        " Semble Token Savings",
        heavy_line,
        f" {'Period':<12} {'Calls':<6} Savings",
        light_line,
    ]
    for label, bucket in summary.buckets.items():
        saved_chars = max(0, bucket.file_chars - bucket.snippet_chars)
        saved_tokens = saved_chars // 4  # standard ~4 chars/token approximation
        saved_str = f"~{_format_count(saved_tokens)}"
        calls_str = _format_count(bucket.calls)
        if bucket.file_chars > 0:
            ratio = saved_chars / bucket.file_chars
            filled = round(ratio * bar_width)
            bar = "█" * filled + "░" * (bar_width - filled)
            pct = round(ratio * 100)
            lines.append(f" {label:<12} {calls_str:<6} [{bar}] {saved_str} tokens ({pct}%)")
        else:
            # No baseline recorded for this period: show an empty bar and no percentage.
            lines.append(f" {label:<12} {calls_str:<6} [{'░' * bar_width}] {saved_str} tokens")
    if verbose and summary.call_type_counts:
        lines += ["", " Usage Breakdown", light_line, f" {'Call type':<16} Calls"]
        for call_type, count in sorted(summary.call_type_counts.items()):
            lines.append(f" {call_type:<16} {_format_count(count)}")
    lines.append(heavy_line)
    lines.append("")
    return "\n".join(lines)
def _make_stats_record(ts: float, call: str = "search", snippet_chars: int = 1_000, file_chars: int = 20_000) -> str:
    """Serialize one stats record as a single JSON line, without a trailing newline."""
    payload = dict(ts=ts, call=call, results=3, snippet_chars=snippet_chars, file_chars=file_chars)
    return json.dumps(payload)
def test_savings_tolerates_bad_json(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
    """Malformed JSON lines are skipped with a warning and add nothing to the totals."""
    stats_file = tmp_path / "stats.jsonl"
    stats_file.write_text("not valid json\n")

    # Pin the capture level on the stats logger so the check does not depend on how
    # the surrounding test session configures logging.
    with caplog.at_level("WARNING", logger="semble.stats"):
        report = format_savings_report(path=stats_file)

    assert "Savings" in report
    assert "Skipping malformed" in caplog.text
    # The bad line must be dropped, not counted as a call.
    assert build_savings_summary(path=stats_file).buckets["All time"].calls == 0
def test_savings_buckets_exclude_old_records(tmp_path: Path) -> None:
    """Each record is counted in exactly the period buckets whose window contains it."""
    stats_file = tmp_path / "stats.jsonl"
    seconds_per_day = 86_400
    now_ts = datetime.now(timezone.utc).timestamp()
    timestamps = [
        datetime(2020, 1, 1, tzinfo=timezone.utc).timestamp(),  # far past: All time only
        now_ts - 8 * seconds_per_day,  # just outside the 7-day window: All time only
        now_ts - 6 * seconds_per_day,  # inside the window but not today
        now_ts,  # today
    ]
    stats_file.write_text("".join(_make_stats_record(ts) + "\n" for ts in timestamps))

    summary = build_savings_summary(path=stats_file)

    assert summary.buckets["All time"].calls == 4
    assert summary.buckets["Last 7 days"].calls == 2
    assert summary.buckets["Today"].calls == 1