Skip to content

Commit 408b0a5

Browse files
committed
added multik n-dim example
1 parent 70c8ee8 commit 408b0a5

File tree

2 files changed

+155
-3
lines changed

2 files changed

+155
-3
lines changed

examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/multik/compatibilityLayer.kt

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,24 @@ import org.jetbrains.kotlinx.dataframe.api.getColumns
1717
import org.jetbrains.kotlinx.dataframe.api.map
1818
import org.jetbrains.kotlinx.dataframe.api.named
1919
import org.jetbrains.kotlinx.dataframe.api.toColumn
20+
import org.jetbrains.kotlinx.dataframe.api.toColumnGroup
2021
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
22+
import org.jetbrains.kotlinx.dataframe.columns.BaseColumn
23+
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
2124
import org.jetbrains.kotlinx.multik.api.mk
2225
import org.jetbrains.kotlinx.multik.api.ndarray
2326
import org.jetbrains.kotlinx.multik.ndarray.complex.Complex
2427
import org.jetbrains.kotlinx.multik.ndarray.data.D1Array
2528
import org.jetbrains.kotlinx.multik.ndarray.data.D2Array
29+
import org.jetbrains.kotlinx.multik.ndarray.data.D3Array
30+
import org.jetbrains.kotlinx.multik.ndarray.data.MultiArray
31+
import org.jetbrains.kotlinx.multik.ndarray.data.NDArray
2632
import org.jetbrains.kotlinx.multik.ndarray.data.get
2733
import org.jetbrains.kotlinx.multik.ndarray.operations.toList
34+
import org.jetbrains.kotlinx.multik.ndarray.operations.toListD2
2835
import kotlin.experimental.ExperimentalTypeInference
2936
import kotlin.reflect.KClass
37+
import kotlin.reflect.KType
3038
import kotlin.reflect.full.isSubtypeOf
3139
import kotlin.reflect.typeOf
3240

@@ -41,6 +49,7 @@ inline fun <reified N> D1Array<N>.convertToColumn(name: String = ""): DataColumn
4149
*
4250
* @return a DataFrame where each element of the source array is represented as a row in a column named "value" under the schema [ValueProperty].
4351
*/
52+
@JvmName("convert1dArrayToDataFrame")
4453
inline fun <reified N> D1Array<N>.convertToDataFrame(): DataFrame<ValueProperty<N>> =
4554
dataFrameOf(ValueProperty<*>::value.name to column(toList()))
4655
.cast()
@@ -80,11 +89,12 @@ inline fun <T, reified N : Complex> DataFrame<T>.convertToMultik(crossinline col
8089
*
8190
* The conversion enforces that `multikArray[x][y] == dataframe[x][y]`
8291
*/
92+
@JvmName("convert2dArrayToDataFrame")
8393
inline fun <reified N> D2Array<N>.convertToDataFrame(columnNameGenerator: (Int) -> String = { "col$it" }): AnyFrame =
84-
(0..<shape[1]).map { col ->
85-
this[0..<shape[0], col]
94+
List(shape[1]) { i ->
95+
this[0..<shape[0], i] // get all cells of column i
8696
.toList()
87-
.toColumn(columnNameGenerator(col))
97+
.toColumn(columnNameGenerator(i))
8898
}.toDataFrame()
8999

90100
/**
@@ -179,3 +189,117 @@ inline fun <reified N : Complex> List<DataColumn<N>>.convertToMultik(): D2Array<
179189
mk.ndarray(toDataFrame().map { it.values() as List<N> })
180190

181191
// endregion
192+
193+
// region higher dimensions
194+
195+
/**
 * Turns a three-dimensional array ([D3Array]) into a DataFrame whose cells hold lists.
 *
 * The result has `shape[0]` rows and `shape[1]` columns; every cell is a list of size `shape[2]`
 * holding the values along the third axis.
 *
 * Every column is named by calling [columnNameGenerator] with its index along the second axis
 * (by default "col0", "col1", ...).
 *
 * Access order is preserved: `multikArray[x][y][z] == dataframe[x][y][z]`
 */
inline fun <reified N> D3Array<N>.convertToDataFrameWithLists(
    columnNameGenerator: (Int) -> String = { "col$it" },
): AnyFrame {
    val allRows = 0..<shape[0]
    val allDepths = 0..<shape[2]

    val columns = (0..<shape[1]).map { y ->
        // fixing the middle index leaves a shape[0] x shape[2] slice: the cells of column y
        val cellsOfColumn = this[allRows, y, allDepths]
        // one list of size shape[2] per row, i.e. shape[0] lists in total
        val cellLists = cellsOfColumn.toListD2()
        cellLists.toColumn(columnNameGenerator(y))
    }
    return columns.toDataFrame()
}
211+
212+
/**
 * Turns a three-dimensional array ([D3Array]) into a DataFrame built from column groups.
 *
 * The result has `shape[0]` rows and `shape[1]` column groups, each holding `shape[2]` columns.
 *
 * [columnNameGenerator] names both levels: it is called with the index along the second axis
 * for each column group, and with the index along the third axis for each column inside a group.
 *
 * Access order is preserved: `multikArray[x][y][z] == dataframe[x][y][z]`
 */
@JvmName("convert3dArrayToDataFrame")
inline fun <reified N> D3Array<N>.convertToDataFrame(
    columnNameGenerator: (Int) -> String = { "col$it" },
): AnyFrame {
    val allRows = 0..<shape[0]
    val allDepths = 0..<shape[2]

    val columnGroups = (0..<shape[1]).map { y ->
        // all cells of column group y form a shape[0] x shape[2] slice
        val valuesPerNestedColumn = this[allRows, y, allDepths]
            .transpose(1, 0) // flipped to shape[2] x shape[0], so each row is one nested column
            .toListD2() // shape[2] lists, each of size shape[0]

        // shape[2] columns end up inside each column group
        val nestedColumns = valuesPerNestedColumn.mapIndexed { z, values ->
            values.toColumn(columnNameGenerator(z))
        }
        nestedColumns.toColumnGroup(columnNameGenerator(y))
    }
    return columnGroups.toDataFrame()
}
231+
232+
/**
 * Exploratory helper that unfolds a [MultiArray] with any number of dimensions
 * into nested lists (`List<List<...>>`) with the same number of dimensions.
 *
 * The outermost list follows the first axis; the innermost lists contain the array's values.
 */
fun <T> MultiArray<T, *>.toListDn(): List<*> {
    val source = this

    // Builds the list for the axis that comes right after the already fixed `prefix` indices.
    fun descend(prefix: IntArray): List<*> {
        val axis = prefix.size
        val isInnermost = axis == shape.lastIndex
        return List(shape[axis]) { i ->
            val position = prefix + i // fix index `i` on the current axis
            if (isInnermost) {
                source[position] // fully specified index: read the value itself
            } else {
                descend(position) // more axes remain: build the nested list
            }
        }
    }
    return descend(intArrayOf())
}
253+
254+
/**
 * Turns a multidimensional array ([NDArray]) into a DataFrame of nested column groups.
 * The traversal follows the same idea as [toListDn].
 *
 * A single-dimensional array is delegated to [D1Array.convertToDataFrame];
 * [columnNameGenerator] is not used in that case.
 *
 * Otherwise, [columnNameGenerator] is called with the index of each column (group)
 * within its parent to produce its name.
 *
 * Access order is preserved: `multikArray[a][b][c][d]... == dataframe[a][b][c][d]...`
 */
inline fun <reified N> NDArray<N, *>.convertToDataFrameNestedGroups(
    noinline columnNameGenerator: (Int) -> String = { "col$it" },
): AnyFrame =
    when (shape.size) {
        1 -> (this as D1Array<N>).convertToDataFrame()

        else -> {
            // The first axis holds the DataFrame rows, which are addressed first with [],
            // so move it behind all other axes: (1, 2, ..., d - 1, 0).
            val rowsLast = ((1..<dim.d) + 0).toIntArray()

            transpose(*rowsLast).convertToDataFrameNestedGroupsRecursive(
                indices = intArrayOf(),
                type = typeOf<N>(),
                columnNameGenerator = columnNameGenerator,
            ) as ColumnGroup<*>
        }
    }
278+
279+
// Recursive worker behind convertToDataFrameNestedGroups.
// `indices` are the positions already fixed on the leading axes. Every remaining axis except the
// last becomes a level of column groups; the last axis provides the values of a column of [type].
// The returned column (group) has an empty name; each recursion level renames its children.
@PublishedApi
internal fun NDArray<*, *>.convertToDataFrameNestedGroupsRecursive(
    indices: IntArray,
    type: KType,
    columnNameGenerator: (Int) -> String = { "col$it" },
): BaseColumn<*> {
    val axis = indices.size
    val extent = shape[axis]

    // More than one axis remains: build one child per index on this axis and group them
    if (axis != shape.lastIndex) {
        val children = List(extent) { i ->
            convertToDataFrameNestedGroupsRecursive(
                indices = indices + i, // fix index `i` on the current axis
                type = type,
                columnNameGenerator = columnNameGenerator,
            ).rename(columnNameGenerator(i))
        }
        return children.toColumnGroup("")
    }

    // Last axis (1D case): read the values into a plain column
    val values = List(extent) { i -> this[indices + i] }
    return DataColumn.createByType(name = "", values = values, type = type)
}
304+
305+
// endregion

examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/multik/main.kt

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import org.jetbrains.kotlinx.multik.ndarray.data.get
1919
/** Entry point: runs the one-, two- and higher-dimensional examples, in that order. */
fun main() {
2020
oneDimension()
2121
twoDimensions()
22+
higherDimensions()
2223
}
2324

2425
fun oneDimension() {
@@ -46,6 +47,7 @@ fun oneDimension() {
4647
fun twoDimensions() {
4748
// we can also convert a 2D ndarray to a DataFrame
4849
// This conversion will create columns like "col0", "col1", etc.
50+
// (careful, when the number of columns is too large, this can cause problems)
4951
// but will allow for similar access like in multik
5052
// aka: `multikArray[x][y] == dataframe[x][y]`
5153
val mk1 = mk.rand<Int>(5, 10)
@@ -68,3 +70,29 @@ fun twoDimensions() {
6870

6971
println(mk2)
7072
}
73+
74+
/**
 * Shows several ways to represent multik arrays with more than two dimensions in a DataFrame.
 */
fun higherDimensions() {
    // Multik arrays can have more than two dimensions.
    // To convert one to a DataFrame, we have to decide how the extra dimensions are represented.
    val cube = mk.rand<Int>(5, 4, 3)

    println(cube)

    // Option 1 (3D): store a list in each cell of the DataFrame to hold the extra dimension
    cube.convertToDataFrameWithLists().print()

    // Option 2 (3D): use column groups.
    // Each column is subdivided into more columns, which keeps `multikArray[x][y][z] == dataframe[x][y][z]`
    cube.convertToDataFrame().print()

    // Beyond 3D, we can simply keep nesting column groups
    val hypercube = mk.rand<Int>(5, 4, 3, 2)
    hypercube.convertToDataFrameNestedGroups().print()

    // ...or use nested DataFrames (in FrameColumns)
    // (for instance, a 4D matrix could be stored in a 2D DataFrame where each cell is another DataFrame)
    // but we'll leave that as an exercise for the reader :)
}

0 commit comments

Comments
 (0)