diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 3a47eb62f524..85ee3ff11468 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -529,6 +530,12 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem { /// The user is responsible for synchronization of calls to this function. void EnsureFinalized(); +/// \brief Return whether a path string is likely a URI. +/// +/// This heuristic is conservative and may return false for malformed URIs. +ARROW_EXPORT +bool IsLikelyUri(std::string_view path); + /// \defgroup filesystem-factories Functions for creating FileSystem instances /// /// @{ diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index dc82afd07e7a..e56788137319 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -337,6 +337,8 @@ bool IsEmptyPath(std::string_view v) { return true; } +} // namespace internal + bool IsLikelyUri(std::string_view v) { if (v.empty() || v[0] == '/') { return false; @@ -357,6 +359,8 @@ bool IsLikelyUri(std::string_view v) { return ::arrow::util::IsValidUriScheme(v.substr(0, pos)); } +namespace internal { + struct Globber::Impl { std::regex pattern_; diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index d49d9d2efa7f..d146a57fbe10 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -27,6 +27,10 @@ namespace arrow { namespace fs { + +// Declared in arrow/filesystem/filesystem.h, defined in path_util.cc. +ARROW_EXPORT bool IsLikelyUri(std::string_view path); + namespace internal { constexpr char kSep = '/'; @@ -159,8 +163,7 @@ std::string ToSlashes(std::string_view s); ARROW_EXPORT bool IsEmptyPath(std::string_view s); -ARROW_EXPORT -bool IsLikelyUri(std::string_view s); +inline bool IsLikelyUri(std::string_view s) { return ::arrow::fs::IsLikelyUri(s); } class ARROW_EXPORT Globber { public: diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 0739b6acba32..0b8ab7480f83 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -84,6 +84,14 @@ def _file_type_to_string(ty): return f"{ty.__class__.__name__}.{ty._name_}" +def _is_likely_uri(path): + cdef c_string c_path + if not isinstance(path, str): + raise TypeError("Path must be a string") + c_path = tobytes(path) + return CIsLikelyUri(cpp_string_view(c_path)) + + cdef class FileInfo(_Weakrefable): """ FileSystem entry info. diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 670ccaaf2455..bc68d3f2d7cd 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -33,6 +33,7 @@ PyFileSystem, _copy_files, _copy_files_selector, + _is_likely_uri, ) # For backward compatibility. @@ -174,13 +175,17 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False): filesystem, path = FileSystem.from_uri(path) except ValueError as e: msg = str(e) - if "empty scheme" in msg or "Cannot parse URI" in msg: - # neither an URI nor a locally existing path, so assume that - # local path was given and propagate a nicer file not found - # error instead of a more confusing scheme parsing error + if "empty scheme" in msg: + # No scheme at all — treat as a local path and propagate + # a nicer "file not found" error later. + pass + elif "Cannot parse URI" in msg and not _is_likely_uri(path): + # Path doesn't look like a URI (no valid scheme prefix), + # so treat it as a local path rather than surfacing a + # confusing URI-parsing error. pass else: - raise e + raise else: path = filesystem.normalize_path(path) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index af01c47c8c7b..178fa7650bba 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -93,6 +93,8 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: "arrow::fs::FileSystemFromUriOrPath"(const c_string& uri, c_string* out_path) + c_bool CIsLikelyUri "arrow::fs::IsLikelyUri"(cpp_string_view path) + cdef cppclass CFileSystemGlobalOptions \ "arrow::fs::FileSystemGlobalOptions": c_string tls_ca_file_path diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 376398baa07e..31f3f8a8cab2 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1695,6 +1695,69 @@ def test_filesystem_from_path_object(path): assert path == p.resolve().absolute().as_posix() +def test_is_likely_uri(): + """Unit tests for the _is_likely_uri() heuristic.""" + from pyarrow.fs import _is_likely_uri + + # Valid URI schemes + assert _is_likely_uri("s3://bucket/key") + assert _is_likely_uri("gs://bucket/key") + assert _is_likely_uri("hdfs://host/path") + assert _is_likely_uri("file:///local/path") + assert _is_likely_uri("abfss://container@account/path") + assert _is_likely_uri("grpc+https://host:443") + + # Not URIs — local paths, Windows drives, empty, etc. + assert not _is_likely_uri("") + assert not _is_likely_uri("/absolute/path") + assert not _is_likely_uri("relative/path") + assert not _is_likely_uri("C:\\Users\\foo") # single-letter → drive + assert not _is_likely_uri("C:/Users/foo") + assert not _is_likely_uri("3bucket://key") # scheme starts with digit + assert not _is_likely_uri("-scheme://key") # scheme starts with dash + assert not _is_likely_uri("schéme://bucket/key") # non-ASCII in scheme + assert not _is_likely_uri("漢字://bucket/key") # non-ASCII in scheme + + +def test_resolve_filesystem_and_path_uri_with_spaces(): + """ + URIs with a recognised scheme but un-encoded spaces must raise + ValueError — NOT silently fall back to LocalFileSystem. + (GH-41365) + """ + from pyarrow.fs import _resolve_filesystem_and_path + + # S3 URI with spaces should raise, not return LocalFileSystem + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path("s3://bucket/path with space/file.parquet") + + # GCS URI with spaces should also raise + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path("gs://bucket/path with space/file.csv") + + # abfss URI with spaces + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path( + "abfss://container@account/dir with space/file" + ) + + +def test_resolve_filesystem_and_path_local_with_spaces(): + """ + Local paths (no scheme) with spaces should still resolve to + LocalFileSystem — they must NOT be confused with malformed URIs. + """ + from pyarrow.fs import _resolve_filesystem_and_path + + # Absolute local path with spaces → LocalFileSystem + fs, path = _resolve_filesystem_and_path("/tmp/path with spaces/data") + assert isinstance(fs, LocalFileSystem) + + # Non-existent absolute path → still LocalFileSystem + fs, path = _resolve_filesystem_and_path("/nonexistent/path") + assert isinstance(fs, LocalFileSystem) + + @pytest.mark.s3 def test_filesystem_from_uri_s3(s3_server): from pyarrow.fs import S3FileSystem