Skip to content

Commit 1df00df

Browse files
qinxuyewjsi
authored and committed
Experimental add support for DataFrame (#351)
* add some properties like is_unique to index * add should_be_monotonic property to all index types in DataFrame * add columns_values to dataframe and its chunk * add max_val_close and min_val_close to index_value * add split_monotonic_index_min_max * add arithmetic to dataframe, not finish yet * add tiling for arithmetic
1 parent 20e59a7 commit 1df00df

28 files changed

+2395
-138
lines changed

mars/_utils.pyx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,12 @@ cdef h_numpy(ob):
157157

158158

159159
cdef h_pandas_index(ob):
    # Hash a pandas Index for tokenization.
    if isinstance(ob, pd.RangeIndex):
        # for range index, there is no need to get the values:
        # (start, stop, step) fully determines the index, so hashing a
        # slice avoids materializing a potentially huge array.
        # NOTE(review): ob._start/_stop/_step are private pandas attributes
        # (public .start/.stop/.step in newer pandas) — confirm the pandas
        # version pin still exposes them.
        return h_iterative([ob.name, getattr(ob, 'names', None),
                            slice(ob._start, ob._stop, ob._step)])
    else:
        return h_iterative([ob.name, getattr(ob, 'names', None), ob.values])
161166

162167

163168
cdef h_pandas_series(ob):

mars/core.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,11 @@ def to_coarse(self):
394394
new_entity.params.update({'raw_chunk_size': self.nsplits})
395395
return new_entity
396396

397+
def is_sparse(self):
    # Whether the underlying data is sparse is determined by the operand
    # that produces this tileable, so delegate to it.
    return self.op.is_sparse()

issparse = is_sparse  # alias, matching the scipy-style spelling
401+
397402
def tiles(self):
398403
return handler.tiles(self)
399404

mars/dataframe/core.py

Lines changed: 106 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,32 +16,75 @@
1616

1717
from ..core import ChunkData, Chunk, Entity, TilesableData
1818
from ..serialize import Serializable, ValueType, ProviderType, DataTypeField, AnyField, SeriesField, \
19-
BoolField, Int64Field, Int32Field, ListField, SliceField, OneOfField, ReferenceField
19+
BoolField, Int64Field, Int32Field, StringField, ListField, SliceField, OneOfField, ReferenceField
2020

2121

2222
class IndexValue(Serializable):
2323
__slots__ = ()
2424

25-
class Index(Serializable):
25+
class IndexBase(Serializable):
    """Serializable metadata common to every index type.

    Records lightweight facts about an index (monotonicity, uniqueness,
    min/max bounds and whether those bounds are inclusive) without
    holding the index values themselves.
    """

    _key = StringField('key')  # to identify if the index is the same
    _is_monotonic_increasing = BoolField('is_monotonic_increasing')
    _is_monotonic_decreasing = BoolField('is_monotonic_decreasing')
    _is_unique = BoolField('is_unique')
    # NOTE(review): presumably a hint that downstream operations expect a
    # monotonic index — confirm against the tiling code that sets it.
    _should_be_monotonic = BoolField('should_be_monotonic')
    _max_val = AnyField('max_val')
    # whether the max bound is inclusive (closed)
    _max_val_close = BoolField('max_val_close')
    _min_val = AnyField('min_val')
    # whether the min bound is inclusive (closed)
    _min_val_close = BoolField('min_val_close')

    @property
    def is_monotonic_increasing(self):
        return self._is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self):
        return self._is_monotonic_decreasing

    @property
    def is_unique(self):
        return self._is_unique

    @property
    def should_be_monotonic(self):
        return self._should_be_monotonic

    @property
    def min_val(self):
        return self._min_val

    @property
    def min_val_close(self):
        return self._min_val_close

    @property
    def max_val(self):
        return self._max_val

    @property
    def max_val_close(self):
        return self._max_val_close
67+
68+
class Index(IndexBase):
2669
_name = AnyField('name')
2770
_data = ListField('data')
2871
_dtype = DataTypeField('dtype')
2972

30-
class RangeIndex(Serializable):
73+
class RangeIndex(IndexBase):
3174
_name = AnyField('name')
3275
_slice = SliceField('slice')
3376

34-
class CategoricalIndex(Serializable):
77+
class CategoricalIndex(IndexBase):
3578
_name = AnyField('name')
3679
_categories = ListField('categories')
3780
_ordered = BoolField('ordered')
3881

39-
class IntervalIndex(Serializable):
82+
class IntervalIndex(IndexBase):
4083
_name = AnyField('name')
4184
_data = ListField('data')
4285
_closed = BoolField('closed')
4386

44-
class DatetimeIndex(Serializable):
87+
class DatetimeIndex(IndexBase):
4588
_name = AnyField('name')
4689
_data = ListField('data')
4790
_freq = AnyField('freq')
@@ -53,7 +96,7 @@ class DatetimeIndex(Serializable):
5396
_dayfirst = BoolField('dayfirst')
5497
_yearfirst = BoolField('yearfirst')
5598

56-
class TimedeltaIndex(Serializable):
99+
class TimedeltaIndex(IndexBase):
57100
_name = AnyField('name')
58101
_data = ListField('data')
59102
_unit = AnyField('unit')
@@ -63,7 +106,7 @@ class TimedeltaIndex(Serializable):
63106
_end = AnyField('end')
64107
_closed = AnyField('closed')
65108

66-
class PeriodIndex(Serializable):
109+
class PeriodIndex(IndexBase):
67110
_name = AnyField('name')
68111
_data = ListField('data')
69112
_freq = AnyField('freq')
@@ -80,25 +123,24 @@ class PeriodIndex(Serializable):
80123
_tz = AnyField('tz')
81124
_dtype = DataTypeField('dtype')
82125

83-
class Int64Index(Serializable):
126+
class Int64Index(IndexBase):
84127
_name = AnyField('name')
85128
_data = ListField('data')
86129
_dtype = DataTypeField('dtype')
87130

88-
class UInt64Index(Serializable):
131+
class UInt64Index(IndexBase):
89132
_name = AnyField('name')
90133
_data = ListField('data')
91134
_dtype = DataTypeField('dtype')
92135

93-
class Float64Index(Serializable):
136+
class Float64Index(IndexBase):
94137
_name = AnyField('name')
95138
_data = ListField('data')
96139
_dtype = DataTypeField('dtype')
97140

98-
class MultiIndex(Serializable):
141+
class MultiIndex(IndexBase):
99142
_names = ListField('name')
100-
_levels = ListField('levels')
101-
_labels = ListField('labels')
143+
_data = ListField('data')
102144
_sortorder = Int32Field('sortorder')
103145

104146
_index_value = OneOfField('index_value', index=Index,
@@ -113,6 +155,42 @@ def __mars_tokenize__(self):
113155
v = self._index_value
114156
return [type(v).__name__] + [getattr(v, f, None) for f in v.__slots__]
115157

158+
@property
159+
def value(self):
160+
return self._index_value
161+
162+
@property
163+
def is_monotonic_increasing(self):
164+
return self._index_value.is_monotonic_increasing
165+
166+
@property
167+
def is_monotonic_decreasing(self):
168+
return self._index_value.is_monotonic_decreasing
169+
170+
@property
171+
def is_monotonic_increasing_or_decreasing(self):
172+
return self.is_monotonic_increasing or self.is_monotonic_decreasing
173+
174+
@property
175+
def is_unique(self):
176+
return self._index_value.is_unique
177+
178+
@property
179+
def min_val(self):
180+
return self._index_value.min_val
181+
182+
@property
183+
def min_val_close(self):
184+
return self._index_value.min_val_close
185+
186+
@property
187+
def max_val(self):
188+
return self._index_value.max_val
189+
190+
@property
191+
def max_val_close(self):
192+
return self._index_value.max_val_close
193+
116194

117195
class IndexChunkData(ChunkData):
118196
__slots__ = ()
@@ -224,18 +302,22 @@ class DataFrameChunkData(ChunkData):
224302
# optional field
225303
_dtypes = SeriesField('dtypes')
226304
_index_value = ReferenceField('index_value', IndexValue)
305+
_columns_value = ReferenceField('columns_value', IndexValue)
227306

228307
@property
229308
def dtypes(self):
230-
return getattr(self, '_dtypes', None) or getattr(self.op, 'dtypes', None)
309+
dt = getattr(self, '_dtypes', None)
310+
if dt is not None:
311+
return dt
312+
return getattr(self.op, 'dtypes', None)
231313

232314
@property
233315
def index_value(self):
234316
return self._index_value
235317

236318
@property
237319
def columns(self):
238-
return self._columns
320+
return self._columns_value
239321

240322

241323
class DataFrameChunk(Chunk):
@@ -249,27 +331,33 @@ class DataFrameData(TilesableData):
249331
# optional field
250332
_dtypes = SeriesField('dtypes')
251333
_index_value = ReferenceField('index_value', IndexValue)
334+
_columns_value = ReferenceField('columns_value', IndexValue)
252335
_chunks = ListField('chunks', ValueType.reference(DataFrameChunkData),
253336
on_serialize=lambda x: [it.data for it in x] if x is not None else x,
254337
on_deserialize=lambda x: [DataFrameChunk(it) for it in x] if x is not None else x)
255338

256339
@property
257340
def dtypes(self):
258-
return getattr(self, '_dtypes', None) or getattr(self.op, 'dtypes', None)
341+
dt = getattr(self, '_dtypes', None)
342+
if dt is not None:
343+
return dt
344+
return getattr(self.op, 'dtypes', None)
259345

260346
@property
261347
def index_value(self):
262348
return self._index_value
263349

264350
@property
265351
def columns(self):
266-
return self._columns
352+
return self._columns_value
267353

268354

269355
class DataFrame(Entity):
270356
__slots__ = ()
271357
_allow_data_type_ = (DataFrameData,)
272358

273359

360+
INDEX_TYPE = (Index, IndexData)
361+
SERIES_TYPE = (Series, SeriesData)
274362
DATAFRAME_TYPE = (DataFrame, DataFrameData)
275363
CHUNK_TYPE = (DataFrameChunk, DataFrameChunkData)
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Copyright 1999-2018 Alibaba Group Holding Ltd.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import operator
16+
import itertools
17+
import hashlib
18+
import functools
19+
20+
try:
21+
import pandas as pd
22+
except ImportError: # pragma: no cover
23+
pass
24+
25+
from ..expressions.arithmetic.core import DataFrameIndexAlignMap, DataFrameIndexAlignReduce
26+
from ..expressions.arithmetic import DataFrameAdd
27+
28+
29+
def _hash(index, size):
30+
def func(x, size):
31+
return int(hashlib.md5(bytes(x)).hexdigest(), 16) % size
32+
33+
f = functools.partial(func, size=size)
34+
grouped = sorted(index.groupby(index.map(f)).items(),
35+
key=operator.itemgetter(0))
36+
return [g[1] for g in grouped]
37+
38+
39+
def _index_align_map(ctx, chunk):
40+
# TODO(QIN): add GPU support here
41+
df = ctx[chunk.inputs[0].key]
42+
43+
filters = [[], []]
44+
45+
if chunk.op.index_shuffle_size is None:
46+
# no shuffle on index
47+
op = operator.ge if chunk.op.index_min_close else operator.gt
48+
index_cond = op(df.index, chunk.op.index_min)
49+
op = operator.le if chunk.op.index_max_close else operator.lt
50+
index_cond = index_cond & op(df.index, chunk.op.index_max)
51+
filters[0].append(index_cond)
52+
else:
53+
# shuffle on index
54+
shuffle_size = chunk.op.index_shuffle_size
55+
filters[0].extend(_hash(df.index, shuffle_size))
56+
57+
if chunk.op.column_shuffle_size is None:
58+
# no shuffle on columns
59+
op = operator.ge if chunk.op.column_min_close else operator.gt
60+
columns_cond = op(df.columns, chunk.op.column_min)
61+
op = operator.le if chunk.op.column_max_close else operator.ge
62+
columns_cond = columns_cond & op(df.columns, chunk.op.column_max)
63+
filters[1].append(columns_cond)
64+
else:
65+
# shuffle on columns
66+
shuffle_size = chunk.op.column_shuffle_size
67+
filters[1].extend(_hash(df.columns, shuffle_size))
68+
69+
if all(len(it) == 1 for it in filters):
70+
# no shuffle
71+
ctx[chunk.key] = df.loc[filters[0][0], filters[1][0]]
72+
return
73+
elif len(filters[0]) == 1:
74+
# shuffle on columns
75+
for column_idx, column_filter in enumerate(filters[1]):
76+
group_key = ','.join([str(chunk.index[0]), str(column_idx)])
77+
ctx[(chunk.key, group_key)] = df.loc[filters[0][0], column_filter]
78+
elif len(filters[1]) == 1:
79+
# shuffle on index
80+
for index_idx, index_filter in enumerate(filters[0]):
81+
group_key = ','.join([str(index_idx), str(chunk.index[1])])
82+
ctx[(chunk.key, group_key)] = df.loc[index_filter, filters[1][0]]
83+
else:
84+
# full shuffle
85+
shuffle_index_size = chunk.op.index_shuffle_size
86+
shuffle_column_size = chunk.op.column_shuffle_size
87+
out_idxes = itertools.product(range(shuffle_index_size), range(shuffle_column_size))
88+
out_index_columns = itertools.product(*filters)
89+
for out_idx, out_index_column in zip(out_idxes, out_index_columns):
90+
index_filter, column_filter = out_index_column
91+
group_key = ','.join(str(i) for i in out_idx)
92+
ctx[(chunk.key, group_key)] = df.loc[index_filter, column_filter]
93+
94+
95+
def _index_align_reduce(ctx, chunk):
96+
input_idx_to_df = {inp.index: ctx[inp.key, ','.join(str(idx) for idx in chunk.index)]
97+
for inp in chunk.inputs[0].inputs}
98+
row_idxes = sorted({idx[0] for idx in input_idx_to_df})
99+
col_idxes = sorted({idx[1] for idx in input_idx_to_df})
100+
101+
res = None
102+
for row_idx in row_idxes:
103+
row_df = None
104+
for col_idx in col_idxes:
105+
df = input_idx_to_df[row_idx, col_idx]
106+
if row_df is None:
107+
row_df = df
108+
else:
109+
row_df = pd.concat([row_df, df], axis=1)
110+
111+
if res is None:
112+
res = row_df
113+
else:
114+
res = pd.concat([res, row_df], axis=0)
115+
116+
ctx[chunk.key] = res
117+
118+
119+
def _add(ctx, chunk):
120+
left, right = ctx[chunk.inputs[0].key], ctx[chunk.inputs[1].key]
121+
ctx[chunk.key] = left.add(right, axis=chunk.op.axis,
122+
level=chunk.op.level, fill_value=chunk.op.fill_value)
123+
124+
125+
def register_arithmetic_handler():
    """Hook the chunk execution handlers above into the Mars executor."""
    # NOTE(review): local import presumably avoids a circular dependency
    # between this module and the executor — confirm.
    from ...executor import register

    register(DataFrameIndexAlignMap, _index_align_map)
    register(DataFrameIndexAlignReduce, _index_align_reduce)
    register(DataFrameAdd, _add)

0 commit comments

Comments
 (0)