From 879c4e744fd4878b5d971b9bd40fde568c5d8d6c Mon Sep 17 00:00:00 2001 From: Georg Brunmayr Date: Thu, 7 May 2026 15:13:23 +0200 Subject: [PATCH] feat: add ability to enable text file indexing via cli and mcp --- src/semble/cli.py | 24 ++++++++++++++++++++++-- src/semble/mcp.py | 22 +++++++++++++++++----- tests/test_cli.py | 24 ++++++++++++++++++++++++ tests/test_mcp.py | 15 +++++++++++++++ 4 files changed, 78 insertions(+), 7 deletions(-) diff --git a/src/semble/cli.py b/src/semble/cli.py index 509d99b..59c4add 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -34,13 +34,18 @@ def _mcp_main() -> None: help="Local directory or git URL to pre-index at startup (optional).", ) parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") + parser.add_argument( + "--include-text-files", + action="store_true", + help="Also index non-code text files (.md, .yaml, .json, etc.).", + ) args = parser.parse_args() if any(find_spec(dep) is None for dep in get_package_extras("semble", "mcp")): print("MCP dependencies are not installed. Run: pip install 'semble[mcp]'", file=sys.stderr) raise SystemExit(1) from semble.mcp import serve - asyncio.run(serve(args.path, ref=args.ref)) + asyncio.run(serve(args.path, ref=args.ref, include_text_files=args.include_text_files)) def _run_init(*, force: bool = False) -> None: @@ -66,12 +71,22 @@ def _cli_main() -> None: search_p.add_argument( "-m", "--mode", default="hybrid", choices=["hybrid", "semantic", "bm25"], help="Search mode (default: hybrid)." ) + search_p.add_argument( + "--include-text-files", + action="store_true", + help="Also index non-code text files (.md, .yaml, .json, etc.).", + ) related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") related_p.add_argument("file_path", help="File path as shown in search results.") related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + related_p.add_argument( + "--include-text-files", + action="store_true", + help="Also index non-code text files (.md, .yaml, .json, etc.).", + ) init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for Claude Code sub-agent support.") init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.") @@ -82,7 +97,12 @@ def _cli_main() -> None: _run_init(force=args.force) return - index = SembleIndex.from_git(args.path) if _is_git_url(args.path) else SembleIndex.from_path(args.path) + include_text_files = args.include_text_files + index = ( + SembleIndex.from_git(args.path, include_text_files=include_text_files) + if _is_git_url(args.path) + else SembleIndex.from_path(args.path, include_text_files=include_text_files) + ) if args.command == "search": results = index.search(args.query, top_k=args.top_k, mode=args.mode) diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 42e814d..9178bee 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -116,10 +116,10 @@ async def find_related( return server -async def serve(path: str | None = None, ref: str | None = None) -> None: +async def serve(path: str | None = None, ref: str | None = None, include_text_files: bool = False) -> None: """Start an MCP stdio server, optionally pre-indexing a default source.""" model = await asyncio.to_thread(load_model) - cache = _IndexCache(model=model) + cache = _IndexCache(model=model, include_text_files=include_text_files) if path: await cache.get(path, ref=ref) if not _is_git_url(path): @@ -132,9 +132,10 @@ async def serve(path: str | None = None, ref: str | None = None) -> None: class _IndexCache: """Cache of indexed repos and local paths for the lifetime of the MCP server process.""" - def __init__(self, model: Encoder) -> None: + def __init__(self, model: Encoder, include_text_files: bool = False) -> None: """Initialise an empty cache with a shared embedding model.""" self._model = model + self._include_text_files = include_text_files self._tasks: OrderedDict[str, asyncio.Task[SembleIndex]] = OrderedDict() # ordered for LRU eviction self._watcher_task: asyncio.Task[None] | None = None @@ -173,11 +174,22 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: self._tasks.popitem(last=False) if _is_git_url(source): self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread(SembleIndex.from_git, source, ref=ref, model=self._model) + asyncio.to_thread( + SembleIndex.from_git, + source, + ref=ref, + model=self._model, + include_text_files=self._include_text_files, + ) ) else: self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread(SembleIndex.from_path, cache_key, model=self._model) + asyncio.to_thread( + SembleIndex.from_path, + cache_key, + model=self._model, + include_text_files=self._include_text_files, + ) ) task = self._tasks[cache_key] try: diff --git a/tests/test_cli.py b/tests/test_cli.py index 0520b7a..e280450 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -195,6 +195,30 @@ def test_mcp_main_exits_with_message_when_extras_missing( assert "pip install 'semble[mcp]'" in capsys.readouterr().err +def test_cli_search_passes_include_text_files(monkeypatch: pytest.MonkeyPatch) -> None: + """--include-text-files is forwarded to SembleIndex.from_path.""" + monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path", "--include-text-files"]) + fake_index = MagicMock() + fake_index.search.return_value = [] + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index) as mock_from_path: + _cli_main() + mock_from_path.assert_called_once_with("/some/path", include_text_files=True) + + +def test_cli_find_related_passes_include_text_files(monkeypatch: pytest.MonkeyPatch) -> None: + """--include-text-files is forwarded to SembleIndex.from_path for find-related.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + monkeypatch.setattr( + sys, "argv", ["semble", "find-related", "src/foo.py", "1", "/some/path", "--include-text-files"] + ) + fake_index = MagicMock() + fake_index.chunks = [chunk] + fake_index.find_related.return_value = [] + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index) as mock_from_path: + _cli_main() + mock_from_path.assert_called_once_with("/some/path", include_text_files=True) + + def test_agent_file_tools_are_bash_only() -> None: """The agent file must list only Bash and Read — no MCP tools that require schema loading.""" frontmatter = _CLAUDE_AGENT_FILE.split("---")[1] diff --git a/tests/test_mcp.py b/tests/test_mcp.py index f9a1c80..7edf18e 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -301,6 +301,21 @@ def test_cache_evict(cache: _IndexCache, tmp_path: Path) -> None: assert key not in cache._tasks +@pytest.mark.anyio +async def test_serve_passes_include_text_files(tmp_path: Path) -> None: + """serve(include_text_files=True) forwards the flag when building the index.""" + with ( + patch("semble.mcp.load_model", return_value=MagicMock(spec=Encoder)), + patch("semble.mcp.SembleIndex.from_path", return_value=MagicMock()) as mock_from_path, + patch.object(_IndexCache, "start_watcher", new_callable=AsyncMock), + patch("mcp.server.fastmcp.FastMCP.run_stdio_async", new_callable=AsyncMock), + ): + await serve(str(tmp_path), include_text_files=True) + + _, kwargs = mock_from_path.call_args + assert kwargs.get("include_text_files") is True + + def test_cache_evict_missing(cache: _IndexCache, tmp_path: Path) -> None: """evict() on an unknown path is a no-op.""" cache.evict(str(tmp_path)) # should not raise