diff --git a/eng/Versions.props b/eng/Versions.props index 902cc3dab3..d13d42aadc 100644 --- a/eng/Versions.props +++ b/eng/Versions.props @@ -30,7 +30,7 @@ 6.0.1 4.7.1 - 2.0.0 + 11.0.0 3.19.6 2.3.1 3.3.0 diff --git a/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs b/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs index 2938413459..270cfff63b 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs @@ -101,10 +101,18 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray AppendDataFrameColumnFromArrowArray(fieldsEnumerator.Current, structArrayEnumerator.Current, ret, field.Name + "_"); } break; - case ArrowTypeId.Decimal: + case ArrowTypeId.Date64: + Date64Array arrowDate64Array = (Date64Array)arrowArray; + dataFrameColumn = new PrimitiveDataFrameColumn(fieldName, arrowDate64Array.Data.Length); + for (int i = 0; i < arrowDate64Array.Data.Length; i++) + { + dataFrameColumn[i] = arrowDate64Array.GetDateTime(i); + } + break; + case ArrowTypeId.Decimal128: + case ArrowTypeId.Decimal256: case ArrowTypeId.Binary: case ArrowTypeId.Date32: - case ArrowTypeId.Date64: case ArrowTypeId.Dictionary: case ArrowTypeId.FixedSizedBinary: case ArrowTypeId.HalfFloat: @@ -114,6 +122,7 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray case ArrowTypeId.Null: case ArrowTypeId.Time32: case ArrowTypeId.Time64: + case ArrowTypeId.Timestamp: default: throw new NotImplementedException($"{fieldType.Name}"); } @@ -145,7 +154,7 @@ public static DataFrame FromArrowRecordBatch(RecordBatch recordBatch) } /// - /// Returns an without copying data + /// Returns an mostly without copying data /// public IEnumerable ToArrowRecordBatches() { diff --git a/src/Microsoft.Data.Analysis/DataFrame.Join.cs b/src/Microsoft.Data.Analysis/DataFrame.Join.cs index 2109573c38..8e4029ffd3 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.Join.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.Join.cs @@ -30,7 +30,7 @@ private void SetSuffixForDuplicatedColumnNames(DataFrame dataFrame, DataFrameCol { // Pre-existing column. Change name DataFrameColumn existingColumn = dataFrame.Columns[index]; - dataFrame._columnCollection.SetColumnName(existingColumn, existingColumn.Name + leftSuffix); + existingColumn.SetName(existingColumn.Name + leftSuffix); column.SetName(column.Name + rightSuffix); index = dataFrame._columnCollection.IndexOf(column.Name); } diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 20d42bb9f7..25a8cbbfbc 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -301,7 +301,7 @@ public DataFrame AddPrefix(string prefix, bool inPlace = false) for (int i = 0; i < df.Columns.Count; i++) { DataFrameColumn column = df.Columns[i]; - df._columnCollection.SetColumnName(column, prefix + column.Name); + column.SetName(prefix + column.Name); df.OnColumnsChanged(); } return df; @@ -316,7 +316,7 @@ public DataFrame AddSuffix(string suffix, bool inPlace = false) for (int i = 0; i < df.Columns.Count; i++) { DataFrameColumn column = df.Columns[i]; - df._columnCollection.SetColumnName(column, column.Name + suffix); + column.SetName(column.Name + suffix); df.OnColumnsChanged(); } return df; diff --git a/src/Microsoft.Data.Analysis/DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumn.cs index 3a2f97f817..1b55b92d8f 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumn.cs @@ -84,6 +84,26 @@ protected set } } + // List of ColumnCollections that owns the column + // Current API allows column to be added into multiple dataframes, that's why the list is needed + private readonly List _ownerColumnCollections = new(); + + internal void AddOwner(DataFrameColumnCollection columCollection) + { + if (!_ownerColumnCollections.Contains(columCollection)) + { + _ownerColumnCollections.Add(columCollection); + } + } + + internal void RemoveOwner(DataFrameColumnCollection columCollection) + { + if (_ownerColumnCollections.Contains(columCollection)) + { + _ownerColumnCollections.Remove(columCollection); + } + } + /// /// The number of values in this column. /// @@ -95,24 +115,30 @@ public abstract long NullCount private string _name; /// - /// The name of this column. + /// The column name. /// public string Name => _name; /// - /// Updates the name of this column. + /// Updates the column name. /// /// The new name. - /// If passed in, update the column name in - public void SetName(string newName, DataFrame dataFrame = null) + public void SetName(string newName) { - if (!(dataFrame is null)) - { - dataFrame.Columns.SetColumnName(this, newName); - } + foreach (var owner in _ownerColumnCollections) + owner.UpdateColumnNameMetadata(this, newName); + _name = newName; } + /// + /// Updates the name of this column. + /// + /// The new name. + /// Ignored (for backward compatibility) + [Obsolete] + public void SetName(string newName, DataFrame dataFrame) => SetName(newName); + /// /// The type of data this column holds. /// diff --git a/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs b/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs index 1fae1168c7..13c363660c 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -38,11 +38,23 @@ internal IReadOnlyList GetColumnNames() return ret; } + public void RenameColumn(string currentName, string newName) + { + var column = this[currentName]; + column.SetName(newName); + } + + [Obsolete] public void SetColumnName(DataFrameColumn column, string newName) + { + column.SetName(newName); + } + + //Updates column's metadata (is used as a callback from Column class) + internal void UpdateColumnNameMetadata(DataFrameColumn column, string newName) { string currentName = column.Name; int currentIndex = _columnNameToIndexDictionary[currentName]; - column.SetName(newName); _columnNameToIndexDictionary.Remove(currentName); _columnNameToIndexDictionary.Add(newName, currentIndex); ColumnsChanged?.Invoke(); @@ -66,7 +78,7 @@ protected override void InsertItem(int columnIndex, DataFrameColumn column) } else if (column.Length != RowCount) { - //check all columns in the dataframe have the same length (amount of rows) + //check all columns in the dataframe have the same lenght (amount of rows) throw new ArgumentException(Strings.MismatchedColumnLengths, nameof(column)); } @@ -75,7 +87,7 @@ protected override void InsertItem(int columnIndex, DataFrameColumn column) throw new ArgumentException(string.Format(Strings.DuplicateColumnName, column.Name), nameof(column)); } - RowCount = column.Length; + column.AddOwner(this); _columnNameToIndexDictionary[column.Name] = columnIndex; for (int i = columnIndex + 1; i < Count; i++) @@ -100,7 +112,10 @@ protected override void SetItem(int columnIndex, DataFrameColumn column) } _columnNameToIndexDictionary.Remove(this[columnIndex].Name); _columnNameToIndexDictionary[column.Name] = columnIndex; + + this[columnIndex].RemoveOwner(this); base.SetItem(columnIndex, column); + ColumnsChanged?.Invoke(); } @@ -111,6 +126,8 @@ protected override void RemoveItem(int columnIndex) { _columnNameToIndexDictionary[this[i].Name]--; } + + this[columnIndex].RemoveOwner(this); base.RemoveItem(columnIndex); //Reset RowCount if the last column was removed and dataframe is empty @@ -204,10 +221,10 @@ public PrimitiveDataFrameColumn GetPrimitiveColumn(string name) } /// - /// Gets the with the specified . + /// Gets the with the specified . /// /// The name of the column - /// . + /// . /// A column named cannot be found, or if the column's type doesn't match. public PrimitiveDataFrameColumn GetDateTimeColumn(string name) { diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs index 92996b136b..d65255d5be 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs @@ -374,18 +374,6 @@ internal int MaxRecordBatchLength(long startIndex) return Buffers[arrayIndex].Length - (int)startIndex; } - internal ReadOnlyMemory GetValueBuffer(long startIndex) - { - int arrayIndex = GetArrayContainingRowIndex(startIndex); - return Buffers[arrayIndex].ReadOnlyBuffer; - } - - internal ReadOnlyMemory GetNullBuffer(long startIndex) - { - int arrayIndex = GetArrayContainingRowIndex(startIndex); - return NullBitMapBuffers[arrayIndex].ReadOnlyBuffer; - } - public IReadOnlyList this[long startIndex, int length] { get diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index 152a6247dc..0fe7820fe2 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -7,6 +7,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using Apache.Arrow; using Apache.Arrow.Types; using Microsoft.ML; @@ -104,6 +105,8 @@ private IArrowType GetArrowType() return UInt64Type.Default; else if (typeof(T) == typeof(ushort)) return UInt16Type.Default; + else if (typeof(T) == typeof(DateTime)) + return Date64Type.Default; else throw new NotImplementedException(nameof(T)); } @@ -127,36 +130,64 @@ protected internal override Apache.Arrow.Array ToArrowArray(long startIndex, int { int arrayIndex = numberOfRows == 0 ? 0 : _columnContainer.GetArrayContainingRowIndex(startIndex); int offset = (int)(startIndex - arrayIndex * ReadOnlyDataFrameBuffer.MaxCapacity); + if (numberOfRows != 0 && numberOfRows > _columnContainer.Buffers[arrayIndex].Length - offset) { throw new ArgumentException(Strings.SpansMultipleBuffers, nameof(numberOfRows)); } - ArrowBuffer valueBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.GetValueBuffer(startIndex)); - ArrowBuffer nullBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.GetNullBuffer(startIndex)); + int nullCount = GetNullCount(startIndex, numberOfRows); + + //DateTime requires convertion + if (this.DataType == typeof(DateTime)) + { + if (numberOfRows == 0) + return new Date64Array(ArrowBuffer.Empty, ArrowBuffer.Empty, numberOfRows, nullCount, offset); + + ReadOnlyDataFrameBuffer valueBuffer = (numberOfRows == 0) ? null : _columnContainer.Buffers[arrayIndex]; + ReadOnlyDataFrameBuffer nullBuffer = (numberOfRows == 0) ? null : _columnContainer.NullBitMapBuffers[arrayIndex]; + + ReadOnlySpan valueSpan = MemoryMarshal.Cast(valueBuffer.ReadOnlySpan); + Date64Array.Builder builder = new Date64Array.Builder().Reserve(valueBuffer.Length); + + for (int i = 0; i < valueBuffer.Length; i++) + { + if (BitUtility.GetBit(nullBuffer.ReadOnlySpan, i)) + builder.Append(valueSpan[i]); + else + builder.AppendNull(); + } + + return builder.Build(); + } + + //No convertion + ArrowBuffer arrowValueBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.Buffers[arrayIndex].ReadOnlyBuffer); + ArrowBuffer arrowNullBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.NullBitMapBuffers[arrayIndex].ReadOnlyBuffer); + Type type = this.DataType; if (type == typeof(bool)) - return new BooleanArray(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new BooleanArray(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(double)) - return new DoubleArray(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new DoubleArray(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(float)) - return new FloatArray(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new FloatArray(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(int)) - return new Int32Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new Int32Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(long)) - return new Int64Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new Int64Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(sbyte)) - return new Int8Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new Int8Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(short)) - return new Int16Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new Int16Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(uint)) - return new UInt32Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new UInt32Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(ulong)) - return new UInt64Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new UInt64Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(ushort)) - return new UInt16Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new UInt16Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else if (type == typeof(byte)) - return new UInt8Array(valueBuffer, nullBuffer, numberOfRows, nullCount, offset); + return new UInt8Array(arrowValueBuffer, arrowNullBuffer, numberOfRows, nullCount, offset); else throw new NotImplementedException(type.ToString()); } diff --git a/test/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs b/test/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs index dacf43a8db..185ab835bb 100644 --- a/test/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs @@ -48,6 +48,7 @@ public void TestArrowIntegration() .Append("ULongColumn", false, new UInt64Array.Builder().AppendRange(Enumerable.Repeat((ulong)1, 10)).Build()) .Append("ByteColumn", false, new Int8Array.Builder().AppendRange(Enumerable.Repeat((sbyte)1, 10)).Build()) .Append("UByteColumn", false, new UInt8Array.Builder().AppendRange(Enumerable.Repeat((byte)1, 10)).Build()) + .Append("Date64Column", false, new Date64Array.Builder().AppendRange(Enumerable.Repeat(DateTime.Now, 10)).Build()) .Build(); DataFrame df = DataFrame.FromArrowRecordBatch(originalBatch); diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 086f5101b2..ff7856e984 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -388,6 +388,44 @@ public void ClearColumnsTests() Assert.Equal(0, dataFrame.Columns.LongCount()); } + [Fact] + public void RenameColumnWithSetNameTests() + { + StringDataFrameColumn city = new StringDataFrameColumn("City", new string[] { "London", "Berlin" }); + PrimitiveDataFrameColumn temp = new PrimitiveDataFrameColumn("Temperature", new int[] { 12, 13 }); + + DataFrame dataframe = new DataFrame(city, temp); + + // Change the name of the column: + dataframe["City"].SetName("Town"); + var renamedColumn = dataframe["Town"]; + + Assert.Throws(() => dataframe["City"]); + + Assert.NotNull(renamedColumn); + Assert.Equal("Town", renamedColumn.Name); + Assert.True(ReferenceEquals(city, renamedColumn)); + } + + [Fact] + public void RenameColumnWithRenameColumnTests() + { + StringDataFrameColumn city = new StringDataFrameColumn("City", new string[] { "London", "Berlin" }); + PrimitiveDataFrameColumn temp = new PrimitiveDataFrameColumn("Temperature", new int[] { 12, 13 }); + + DataFrame dataframe = new DataFrame(city, temp); + + // Change the name of the column: + dataframe.Columns.RenameColumn("City", "Town"); + var renamedColumn = dataframe["Town"]; + + Assert.Throws(() => dataframe["City"]); + + Assert.NotNull(renamedColumn); + Assert.Equal("Town", renamedColumn.Name); + Assert.True(ReferenceEquals(city, renamedColumn)); + } + [Fact] public void TestBinaryOperations() { diff --git a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj index cb9461bb50..c1dd6a4f0c 100644 --- a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj +++ b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj @@ -9,7 +9,6 @@ -