From 2056fec96ef6b6dd0b040fd895547f87f05d148a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 9 Mar 2026 17:11:40 +0100 Subject: [PATCH 1/3] inject docstrings into stubs --- python/CMakeLists.txt | 28 +++++++ python/scripts/update_stub_docstrings.py | 95 ++++++++++++++++++------ 2 files changed, 101 insertions(+), 22 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0630e0cff7cb..d7e0b70939fa 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1025,3 +1025,31 @@ if(PYARROW_BUILD_PARQUET) target_link_libraries(_parquet_encryption PRIVATE arrow_python_parquet_encryption) endif() endif() + +# +# Type stubs with docstring injection +# +set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_SOURCE_DIR}/pyarrow-stubs/pyarrow") +if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") + install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/" + DESTINATION "." + FILES_MATCHING + PATTERN "*.pyi") + + if(DEFINED SKBUILD_STATE + AND SKBUILD_STATE STREQUAL "wheel" + AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + install(CODE " + execute_process( + COMMAND \"${Python3_EXECUTABLE}\" + \"${CMAKE_SOURCE_DIR}/scripts/update_stub_docstrings.py\" + \"${CMAKE_INSTALL_PREFIX}\" + \"${CMAKE_SOURCE_DIR}\" + RESULT_VARIABLE _pyarrow_stub_docstrings_result + ) + if(NOT _pyarrow_stub_docstrings_result EQUAL 0) + message(WARNING \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + endif() + ") + endif() +endif() diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py index 5fd24014a024..a405b052a371 100644 --- a/python/scripts/update_stub_docstrings.py +++ b/python/scripts/update_stub_docstrings.py @@ -18,14 +18,18 @@ """ Extract docstrings from pyarrow runtime and insert them into stub files. -Usage (from python/ directory with pyarrow built): - python scripts/update_stub_docstrings.py pyarrow-stubs +Usage: + python scripts/update_stub_docstrings.py """ import argparse import importlib import inspect +import os +import shutil import sys +import sysconfig +import tempfile from pathlib import Path from textwrap import indent @@ -186,7 +190,8 @@ def add_docstrings_to_stubs(stubs_dir): if module_name in LIB_MODULES: namespace = "lib" elif stub_file.parent.name in ("parquet", "interchange"): - namespace = f"{stub_file.parent.name}.{module_name}" + namespace = (stub_file.parent.name if module_name == "__init__" + else f"{stub_file.parent.name}.{module_name}") elif module_name == "__init__": namespace = "" else: @@ -198,31 +203,77 @@ def add_docstrings_to_stubs(stubs_dir): stub_file.write_text(modified.code) -def add_docstrings_from_build(stubs_dir, build_lib): - """ - Entry point for setup.py: update docstrings using pyarrow from build directory. +def _link_or_copy(source, destination): + if sys.platform != "win32": + try: + os.symlink(source, destination) + return + except OSError: + pass + + if source.is_dir(): + shutil.copytree(source, destination, symlinks=(sys.platform != "win32")) + else: + shutil.copy2(source, destination) + - During the build process, pyarrow is not installed in the system Python. - We need to temporarily add the build directory to sys.path so we can - import pyarrow and extract docstrings from it. +def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir): """ - stubs_dir, build_lib = Path(stubs_dir), Path(build_lib) + Populate pyarrow_pkg with source Python modules and installed binary artifacts + so that pyarrow can be imported from the parent directory of pyarrow_pkg. + """ + ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ".so" + source_pyarrow = source_dir / "pyarrow" + if not source_pyarrow.exists(): + raise FileNotFoundError(f"PyArrow source package not found: {source_pyarrow}") + + for source_path in source_pyarrow.iterdir(): + if source_path.suffix == ".py": + _link_or_copy(source_path, pyarrow_pkg / source_path.name) + elif source_path.is_dir() and not source_path.name.startswith((".", "__")): + _link_or_copy(source_path, pyarrow_pkg / source_path.name) + + for artifact in install_pyarrow_dir.iterdir(): + if not artifact.is_file(): + continue - sys.path.insert(0, str(build_lib)) - try: - add_docstrings_to_stubs(stubs_dir) - finally: - sys.path.pop(0) + destination = pyarrow_pkg / artifact.name + if destination.exists(): + continue + + is_extension = ext_suffix in artifact.name or artifact.suffix == ".pyd" + is_shared_library = ( + ".so" in artifact.name or artifact.suffix in (".dylib", ".dll") + ) + if is_extension or is_shared_library: + _link_or_copy(artifact, destination) if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("stubs_dir", type=Path, help="Path to pyarrow-stubs folder") + parser.add_argument("install_prefix", type=Path, + help="CMAKE_INSTALL_PREFIX used by wheel build") + parser.add_argument("source_dir", type=Path, + help="PyArrow source directory") args = parser.parse_args() - # Add the directory containing this script's parent (python/) to sys.path - # so pyarrow can be imported when running from the python/ directory - script_dir = Path(__file__).resolve().parent - python_dir = script_dir.parent - sys.path.insert(0, str(python_dir)) - add_docstrings_to_stubs(args.stubs_dir.resolve()) + install_prefix = args.install_prefix.resolve() + source_dir = args.source_dir.resolve() + install_pyarrow_dir = install_prefix / "pyarrow" + if not install_pyarrow_dir.exists(): + install_pyarrow_dir = install_prefix + + if not any(install_pyarrow_dir.rglob("*.pyi")): + print("No .pyi files found in install tree, skipping docstring injection") + sys.exit(0) + + with tempfile.TemporaryDirectory() as tmpdir: + pyarrow_pkg = Path(tmpdir) / "pyarrow" + pyarrow_pkg.mkdir() + _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir) + + sys.path.insert(0, tmpdir) + try: + add_docstrings_to_stubs(install_pyarrow_dir) + finally: + sys.path.pop(0) From f885111a427b36c803b25f37316ec912e864e893 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 10 Mar 2026 01:32:33 +0100 Subject: [PATCH 2/3] check stubs are included at wheel build time --- ci/scripts/python_test_type_annotations.sh | 4 +- ci/scripts/python_wheel_validate_contents.py | 41 ++++++++++++++++++++ python/CMakeLists.txt | 31 +++++++++++++-- python/pyproject.toml | 11 ++++-- 4 files changed, 77 insertions(+), 10 deletions(-) diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index c1a051b1e56d..092bedf3f5ea 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -34,5 +34,5 @@ pip install mypy pyright ty # Run type checkers cd "${pyarrow_dir}" mypy -pyright -ty check +pyright --stats +ty check --verbose --output-format concise diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 153a70eb4069..493811c1258b 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -27,6 +27,7 @@ def validate_wheel(path): error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})" assert len(wheels) == 1, error_msg f = zipfile.ZipFile(wheels[0]) + outliers = [ info.filename for info in f.filelist if not re.match( r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)', info.filename @@ -37,6 +38,46 @@ def validate_wheel(path): assert any(info.filename.split("/")[-1] == filename for info in f.filelist), \ f"{filename} is missing from the wheel." + + assert any(info.filename == "pyarrow/py.typed" for info in f.filelist), \ + "pyarrow/py.typed is missing from the wheel." + + source_root = Path(__file__).resolve().parents[2] + stubs_dir = source_root / "python" / "pyarrow-stubs" / "pyarrow" + assert stubs_dir.exists(), f"Stub source directory not found: {stubs_dir}" + + expected_stub_files = { + f"pyarrow/{stub_file.relative_to(stubs_dir).as_posix()}" + for stub_file in stubs_dir.rglob("*.pyi") + } + + wheel_stub_files = { + info.filename + for info in f.filelist + if info.filename.startswith("pyarrow/") and info.filename.endswith(".pyi") + } + + assert wheel_stub_files == expected_stub_files, ( + "Wheel .pyi files differ from python/pyarrow-stubs/pyarrow.\n" + f"Missing in wheel: {sorted(expected_stub_files - wheel_stub_files)}\n" + f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}" + ) + + docstring_injected_stub_files = [] + for wheel_stub_file in wheel_stub_files: + stub_relpath = Path(wheel_stub_file).relative_to("pyarrow") + source_stub_file = stubs_dir / stub_relpath + source_content = source_stub_file.read_text(encoding="utf-8") + wheel_content = f.read(wheel_stub_file).decode("utf-8") + if wheel_content.count('"""') > source_content.count('"""'): + docstring_injected_stub_files.append(wheel_stub_file) + + assert docstring_injected_stub_files, ( + "No injected docstrings were detected in wheel stub files. " + "Expected at least one .pyi file in the wheel to contain more " + "triple-quoted docstrings than its source stub counterpart." + ) + print(f"The wheel: {wheels[0]} seems valid.") # TODO(GH-32609): Validate some docstrings were generated and added. diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index d7e0b70939fa..c9d9b7aa82c1 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1029,7 +1029,18 @@ endif() # # Type stubs with docstring injection # -set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_SOURCE_DIR}/pyarrow-stubs/pyarrow") +# Stubs live in pyarrow-stubs/pyarrow/ during development but are installed +# alongside the package so type checkers can find them (PEP 561). +set(PYARROW_REQUIRE_STUB_DOCSTRINGS OFF) +if(DEFINED SKBUILD_STATE + AND SKBUILD_STATE STREQUAL "wheel" + AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" + AND DEFINED ENV{CI} + AND NOT "$ENV{CI}" STREQUAL "") + set(PYARROW_REQUIRE_STUB_DOCSTRINGS ON) +endif() + +set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pyarrow-stubs/pyarrow") if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/" DESTINATION "." @@ -1042,14 +1053,26 @@ if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") install(CODE " execute_process( COMMAND \"${Python3_EXECUTABLE}\" - \"${CMAKE_SOURCE_DIR}/scripts/update_stub_docstrings.py\" + \"${CMAKE_CURRENT_SOURCE_DIR}/scripts/update_stub_docstrings.py\" \"${CMAKE_INSTALL_PREFIX}\" - \"${CMAKE_SOURCE_DIR}\" + \"${CMAKE_CURRENT_SOURCE_DIR}\" RESULT_VARIABLE _pyarrow_stub_docstrings_result ) if(NOT _pyarrow_stub_docstrings_result EQUAL 0) - message(WARNING \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + if(${PYARROW_REQUIRE_STUB_DOCSTRINGS}) + message(FATAL_ERROR \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + else() + message(WARNING \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + endif() endif() ") endif() +else() + if(PYARROW_REQUIRE_STUB_DOCSTRINGS) + message(FATAL_ERROR "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; " + "cannot build CI wheel without .pyi files.") + else() + message(WARNING "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; " + "wheel will be built without .pyi files.") + endif() endif() diff --git a/python/pyproject.toml b/python/pyproject.toml index 14aa37ed0453..7ed2dce51a92 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -85,7 +85,7 @@ exclude = [ [tool.scikit-build] cmake.build-type = "Release" metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" -sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/"] +sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/", "pyarrow-stubs/"] wheel.packages = ["pyarrow"] wheel.install-dir = "pyarrow" @@ -102,7 +102,7 @@ version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' -# TODO: Enable type checking once stubs are merged +# TODO: Enable more type checks as more stubs are merged [tool.mypy] files = ["pyarrow-stubs"] mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" @@ -113,7 +113,7 @@ exclude = [ "^scripts/", ] -# TODO: Enable type checking once stubs are merged +# TODO: Enable more type checks as more stubs are merged [tool.pyright] pythonPlatform = "All" pythonVersion = "3.10" @@ -128,7 +128,10 @@ exclude = [ stubPath = "pyarrow-stubs" typeCheckingMode = "basic" -# TODO: Enable type checking once stubs are merged +# TODO: Enable more type checks as more stubs are merged +[tool.ty.environment] +extra-paths = ["pyarrow-stubs"] + [tool.ty.src] include = ["pyarrow-stubs"] exclude = [ From e4903ebbed2c3f5839d2e901b4ebf1dd2e1d7683 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 10 Mar 2026 13:42:47 +0100 Subject: [PATCH 3/3] we don't have docstrings yet --- ci/scripts/python_wheel_validate_contents.py | 36 ++++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 493811c1258b..52753eb51ded 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -16,11 +16,27 @@ # under the License. import argparse +import ast from pathlib import Path import re import zipfile +def _count_docstrings(source): + """Count docstrings in module, function, and class bodies.""" + tree = ast.parse(source) + count = 0 + for node in ast.walk(tree): + if isinstance(node, (ast.Module, ast.FunctionDef, + ast.AsyncFunctionDef, ast.ClassDef)): + if (node.body + and isinstance(node.body[0], ast.Expr) + and isinstance(node.body[0].value, ast.Constant) + and isinstance(node.body[0].value.value, str)): + count += 1 + return count + + def validate_wheel(path): p = Path(path) wheels = list(p.glob('*.whl')) @@ -63,23 +79,15 @@ def validate_wheel(path): f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}" ) - docstring_injected_stub_files = [] - for wheel_stub_file in wheel_stub_files: - stub_relpath = Path(wheel_stub_file).relative_to("pyarrow") - source_stub_file = stubs_dir / stub_relpath - source_content = source_stub_file.read_text(encoding="utf-8") - wheel_content = f.read(wheel_stub_file).decode("utf-8") - if wheel_content.count('"""') > source_content.count('"""'): - docstring_injected_stub_files.append(wheel_stub_file) - - assert docstring_injected_stub_files, ( - "No injected docstrings were detected in wheel stub files. " - "Expected at least one .pyi file in the wheel to contain more " - "triple-quoted docstrings than its source stub counterpart." + wheel_docstring_count = sum( + _count_docstrings(f.read(wsf).decode("utf-8")) + for wsf in wheel_stub_files ) + print(f"Found {wheel_docstring_count} docstring(s) in wheel stubs.") + assert wheel_docstring_count, "No docstrings found in wheel stub files." + print(f"The wheel: {wheels[0]} seems valid.") - # TODO(GH-32609): Validate some docstrings were generated and added. def main(): parser = argparse.ArgumentParser()