Skip to content

Commit 286bf7c

Browse files
committed
ARROW-4637: [Python] Conditionally import pandas symbols if they are used. Do not require pandas as a test dependency
Warning: hold your nose for this one =) I think this can be made cleaner, but I just wanted to triage all the problems and make sure we don't introduce a hard dependency on pandas again. Also resolves ARROW-4794: make pandas an optional dependency. Author: Wes McKinney <[email protected]> Closes #3893 from wesm/ARROW-4637 and squashes the following commits: 3c353b6 <Wes McKinney> do not override orc global mark in test_orc.py 59fdf8a <Wes McKinney> Tweak pyarrow._orc import to see if it fixes MSVC 747c893 <Wes McKinney> Address Python 2.7 unicode interaction issue with Cython 4d94b2e <Wes McKinney> Add benchmark from ARROW-4629 f7bf774 <Wes McKinney> Cythonize pandas API shim for better performance 4a2549a <Wes McKinney> Remove TF testing from travis_script_manylinux1.sh 5061c6e <Wes McKinney> Do not require pandas to run unit tests 385cfe5 <Wes McKinney> Finish pandas API shim; do not eagerly import pandas, add to CI 804587a <Wes McKinney> add import test script bb0240e <Wes McKinney> Begin to refactor to make references to pandas more centralized and lazy
1 parent 9c33e1a commit 286bf7c

33 files changed

+756
-408
lines changed

ci/travis_script_manylinux.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,9 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do
5959
conda activate $CONDA_ENV_DIR
6060

6161
# install the produced wheels
62-
pip install tensorflow
6362
pip install dist/*.whl
6463

65-
# Test optional dependencies and the presence of tensorflow
64+
# Test optional dependencies
6665
python check_imports.py
6766

6867
# Install test dependencies and run pyarrow tests

ci/travis_script_python.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ python -c "import pyarrow.parquet"
166166
python -c "import pyarrow.plasma"
167167
python -c "import pyarrow.orc"
168168

169+
# Ensure we do not eagerly import pandas (or other expensive imports)
170+
python < scripts/test_imports.py
171+
169172
echo "PLASMA_VALGRIND: $PLASMA_VALGRIND"
170173

171174
# Set up huge pages for plasma test

docs/source/developers/python.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -360,8 +360,6 @@ Now, we build and install Arrow C++ libraries
360360
set ARROW_HOME=C:\thirdparty
361361
cmake -G "Visual Studio 14 2015 Win64" ^
362362
-DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
363-
-DCMAKE_BUILD_TYPE=Release ^
364-
-DARROW_BUILD_TESTS=on ^
365363
-DARROW_CXXFLAGS="/WX /MP" ^
366364
-DARROW_GANDIVA=on ^
367365
-DARROW_PARQUET=on ^
@@ -380,7 +378,9 @@ Now, we can build pyarrow:
380378
.. code-block:: shell
381379
382380
cd python
383-
python setup.py build_ext --inplace --with-parquet
381+
set PYARROW_WITH_GANDIVA=1
382+
set PYARROW_WITH_PARQUET=1
383+
python setup.py build_ext --inplace
384384
385385
Then run the unit tests with:
386386

python/benchmarks/convert_pandas.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,17 @@ def time_serialize_pandas(self):
105105

106106
def time_deserialize_pandas(self):
107107
pa.deserialize_pandas(self.serialized)
108+
109+
110+
class TableFromPandasMicroperformance(object):
111+
# ARROW-4629
112+
113+
def setup(self):
114+
ser = pd.Series(range(10000))
115+
df = pd.DataFrame({col: ser.copy(deep=True) for col in range(100)})
116+
# Simulate a real dataset by converting some columns to strings
117+
self.df = df.astype({col: str for col in range(50)})
118+
119+
def time_Table_from_pandas(self):
120+
for _ in range(50):
121+
pa.Table.from_pandas(self.df, nthreads=1)

python/pyarrow/array.pxi

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,6 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
from pyarrow.compat import HAVE_PANDAS
19-
20-
if HAVE_PANDAS:
21-
import pyarrow.pandas_compat as pdcompat
22-
2318

2419
cdef _sequence_to_array(object sequence, object mask, object size,
2520
DataType type, CMemoryPool* pool, c_bool from_pandas):
@@ -46,12 +41,10 @@ cdef _sequence_to_array(object sequence, object mask, object size,
4641
return pyarrow_wrap_chunked_array(out)
4742

4843

49-
cdef _is_array_like(obj):
50-
try:
51-
import pandas
52-
return isinstance(obj, (np.ndarray, pd.Series, pd.Index, Categorical))
53-
except ImportError:
54-
return isinstance(obj, np.ndarray)
44+
cdef inline _is_array_like(obj):
45+
if isinstance(obj, np.ndarray):
46+
return True
47+
return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj)
5548

5649

5750
def _ndarray_to_arrow_type(object values, DataType type):
@@ -163,15 +156,15 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
163156

164157
values = get_series_values(obj)
165158

166-
if isinstance(values, Categorical):
159+
if pandas_api.is_categorical(values):
167160
return DictionaryArray.from_arrays(
168161
values.codes, values.categories.values,
169162
mask=mask, ordered=values.ordered,
170163
from_pandas=True, safe=safe,
171164
memory_pool=memory_pool)
172165
else:
173-
if HAVE_PANDAS:
174-
values, type = pdcompat.get_datetimetz_type(
166+
if pandas_api.have_pandas:
167+
values, type = pandas_api.compat.get_datetimetz_type(
175168
values, obj.dtype, type)
176169
return _ndarray_to_array(values, mask, type, from_pandas, safe,
177170
pool)
@@ -852,9 +845,10 @@ cdef wrap_array_output(PyObject* output):
852845
cdef object obj = PyObject_to_object(output)
853846

854847
if isinstance(obj, dict):
855-
return Categorical(obj['indices'],
856-
categories=obj['dictionary'],
857-
ordered=obj['ordered'], fastpath=True)
848+
return pandas_api.categorical_type(obj['indices'],
849+
categories=obj['dictionary'],
850+
ordered=obj['ordered'],
851+
fastpath=True)
858852
else:
859853
return obj
860854

@@ -1385,11 +1379,11 @@ cdef dict _array_classes = {
13851379

13861380

13871381
cdef object get_series_values(object obj):
1388-
if isinstance(obj, PandasSeries):
1382+
if pandas_api.is_series(obj):
13891383
result = obj.values
13901384
elif isinstance(obj, np.ndarray):
13911385
result = obj
13921386
else:
1393-
result = PandasSeries(obj).values
1387+
result = pandas_api.make_series(obj).values
13941388

13951389
return result

python/pyarrow/compat.py

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
# flake8: noqa
1919

20-
from distutils.version import LooseVersion
2120
import itertools
2221

2322
import numpy as np
@@ -31,38 +30,6 @@
3130
PY26 = sys.version_info[:2] == (2, 6)
3231
PY2 = sys.version_info[0] == 2
3332

34-
try:
35-
import pandas as pd
36-
pdver = LooseVersion(pd.__version__)
37-
if pdver >= '0.20.0':
38-
from pandas.api.types import DatetimeTZDtype
39-
pdapi = pd.api.types
40-
elif pdver >= '0.19.0':
41-
from pandas.types.dtypes import DatetimeTZDtype
42-
pdapi = pd.api.types
43-
else:
44-
from pandas.types.dtypes import DatetimeTZDtype
45-
pdapi = pd.core.common
46-
47-
PandasSeries = pd.Series
48-
Categorical = pd.Categorical
49-
HAVE_PANDAS = True
50-
except:
51-
HAVE_PANDAS = False
52-
class DatetimeTZDtype(object):
53-
pass
54-
55-
class ClassPlaceholder(object):
56-
57-
def __init__(self, *args, **kwargs):
58-
raise NotImplementedError
59-
60-
class PandasSeries(ClassPlaceholder):
61-
pass
62-
63-
class Categorical(ClassPlaceholder):
64-
pass
65-
6633

6734
if PY26:
6835
import unittest2 as unittest

python/pyarrow/feather.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,31 +15,25 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
from distutils.version import LooseVersion
1918
import os
2019

2120
import six
22-
import pandas as pd
2321

24-
from pyarrow.compat import pdapi
22+
from pyarrow.pandas_compat import _pandas_api # noqa
2523
from pyarrow.lib import FeatherError # noqa
2624
from pyarrow.lib import Table, concat_tables
2725
import pyarrow.lib as ext
2826

2927

30-
try:
31-
infer_dtype = pdapi.infer_dtype
32-
except AttributeError:
33-
infer_dtype = pd.lib.infer_dtype
34-
35-
36-
if LooseVersion(pd.__version__) < '0.17.0':
37-
raise ImportError("feather requires pandas >= 0.17.0")
28+
def _check_pandas_version():
29+
if _pandas_api.loose_version < '0.17.0':
30+
raise ImportError("feather requires pandas >= 0.17.0")
3831

3932

4033
class FeatherReader(ext.FeatherReader):
4134

4235
def __init__(self, source):
36+
_check_pandas_version()
4337
self.source = source
4438
self.open(source)
4539

@@ -80,12 +74,13 @@ def check_chunked_overflow(col):
8074
class FeatherWriter(object):
8175

8276
def __init__(self, dest):
77+
_check_pandas_version()
8378
self.dest = dest
8479
self.writer = ext.FeatherWriter()
8580
self.writer.open(dest)
8681

8782
def write(self, df):
88-
if isinstance(df, pd.SparseDataFrame):
83+
if isinstance(df, _pandas_api.pd.SparseDataFrame):
8984
df = df.to_dense()
9085

9186
if not df.columns.is_unique:
@@ -114,6 +109,7 @@ class FeatherDataset(object):
114109
Check that individual file schemas are all the same / compatible
115110
"""
116111
def __init__(self, path_or_paths, validate_schema=True):
112+
_check_pandas_version()
117113
self.paths = path_or_paths
118114
self.validate_schema = validate_schema
119115

python/pyarrow/lib.pyx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@
1919
# distutils: language = c++
2020
# cython: embedsignature = True
2121

22+
from collections import OrderedDict
2223
import datetime
2324
import decimal as _pydecimal
25+
import json
2426
import multiprocessing
2527
import numpy as np
2628
import os
2729
import six
28-
from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical
30+
from pyarrow.compat import frombytes, tobytes
2931

3032
from cython.operator cimport dereference as deref
3133
from pyarrow.includes.libarrow cimport *
@@ -90,6 +92,9 @@ Type_MAP = _Type_MAP
9092
UnionMode_SPARSE = _UnionMode_SPARSE
9193
UnionMode_DENSE = _UnionMode_DENSE
9294

95+
# pandas API shim
96+
include "pandas-shim.pxi"
97+
9398
# Exception types
9499
include "error.pxi"
95100

python/pyarrow/orc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
from itertools import count
1919
from numbers import Integral
2020

21-
from pyarrow import _orc
2221
from pyarrow import types
2322
from pyarrow.lib import Schema
23+
import pyarrow._orc as _orc
2424

2525

2626
def _is_map(typ):

0 commit comments

Comments
 (0)