Skip to content
This repository was archived by the owner on Dec 29, 2021. It is now read-only.

Commit a438661

Browse files
committed
add ability to slice data
1 parent 5a495b2 commit a438661

File tree

3 files changed

+71
-71
lines changed

3 files changed

+71
-71
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ authors = ["Neville Dipale <nevilledips@gmail.com>"]
55
edition = "2018"
66

77
[dependencies]
8-
arrow = { git = "https://github.com/nevi-me/arrow", rev = "cb38e68"}
8+
arrow = { git = "https://github.com/apache/arrow"}
99
# arrow = { path = "../../arrow/rust/arrow"}
1010
num = "0.2"
1111
num-traits = "0.2"

src/dataframe.rs

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ pub struct DataFrame {
2222
columns: Vec<Column>,
2323
}
2424

25-
2625
impl DataFrame {
2726
/// Create an empty `DataFrame`
2827
fn empty() -> Self {
@@ -143,31 +142,16 @@ impl DataFrame {
143142
}
144143

145144
/// Returns dataframe with the first n records selected
146-
///
147-
/// TODO: this should work through batches, and slice the last one that makes
148-
/// the length match what we're taking.
149-
// fn take(&self, count: usize) -> Self {
150-
// DataFrame::new(
151-
// self.schema.clone(),
152-
// self.columns
153-
// .into_iter()
154-
// .map(|col| {
155-
// ArrayDataBuilder::new(col.data_type().clone())
156-
// .child_data(
157-
// col.data()
158-
// .child_data()
159-
// .iter()
160-
// .take(count)
161-
// .into_iter()
162-
// .map(|x| x.clone())
163-
// .collect(),
164-
// )
165-
// .build()
166-
// })
167-
// .map(|col| utils::make_array(col))
168-
// .collect(),
169-
// )
170-
// }
145+
fn take(&self, count: usize) -> Self {
146+
DataFrame::new(
147+
self.schema.clone(),
148+
self.columns
149+
.clone()
150+
.into_iter()
151+
.map(|col| col.slice(0, Some(count)))
152+
.collect(),
153+
)
154+
}
171155

172156
fn intersect(&self, other: &DataFrame) -> Self {
173157
unimplemented!("Intersect not yet implemented")

src/table.rs

Lines changed: 60 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ pub struct ChunkedArray {
1010
chunks: Vec<Arc<Array>>,
1111
num_rows: usize,
1212
null_count: usize,
13-
// TODO: Go has data_type, is it worth storing, or getting from the first chunk?
1413
}
1514

1615
impl ChunkedArray {
@@ -40,7 +39,7 @@ impl ChunkedArray {
4039
self.num_rows
4140
}
4241

43-
fn null_count(&self) -> usize {
42+
pub fn null_count(&self) -> usize {
4443
self.null_count
4544
}
4645

@@ -63,29 +62,24 @@ impl ChunkedArray {
6362
/// The `offset` is the position of the first element in the constructed slice.
6463
/// `length` is the length of the slice. If there are not enough elements in the chunked array,
6564
/// the length will be adjusted accordingly.
66-
///
67-
/// TODO: I've made length optional because CPP has 2 `slice` methods, with one being a slice
68-
/// to the end of the array.
69-
///
70-
/// TODO: This relies on my version of slice, which I'm still implementing.
71-
// fn slice(&self, offset: usize, length: Option<usize>) -> Self {
72-
// // unimplemented!("TODO: I need help here, this has to be a zero-copy slice among slices")
73-
// let mut offset = offset;
74-
// let mut length = length.unwrap_or(std::usize::MAX);
75-
// let mut current_chunk: usize = 0;
76-
// let mut new_chunks: Vec<ArrayRef> = vec![];
77-
// while current_chunk < self.num_chunks() && offset >= self.chunk(current_chunk).len() {
78-
// offset -= self.chunk(current_chunk).len();
79-
// current_chunk += 1;
80-
// }
81-
// while current_chunk < self.num_chunks() && length > 0 {
82-
// new_chunks.push(self.chunk(current_chunk).slice(offset, length));
83-
// length -= self.chunk(current_chunk).len() - offset;
84-
// offset = 0;
85-
// current_chunk += 1;
86-
// }
87-
// Self::from_arrays(new_chunks)
88-
// }
65+
fn slice(&self, offset: usize, length: Option<usize>) -> Self {
66+
let mut offset = offset;
67+
let mut length = length.unwrap_or(std::usize::MAX);
68+
let mut current_chunk: usize = 0;
69+
let mut new_chunks: Vec<ArrayRef> = vec![];
70+
// compute the first offset. While offset >= the current chunk's length, skip that whole chunk
71+
while current_chunk < self.num_chunks() && offset >= self.chunk(current_chunk).len() {
72+
offset -= self.chunk(current_chunk).len();
73+
current_chunk += 1;
74+
}
75+
while current_chunk < self.num_chunks() && length > 0 {
76+
new_chunks.push(self.chunk(current_chunk).slice(offset, length));
77+
length -= self.chunk(current_chunk).len() - offset;
78+
offset = 0;
79+
current_chunk += 1;
80+
}
81+
Self::from_arrays(new_chunks)
82+
}
8983

9084
fn flatten(&self) {
9185
unimplemented!("This is for flattening struct columns, we aren't yet there")
@@ -165,8 +159,17 @@ impl Column {
165159
&self.field
166160
}
167161

168-
/// TODO: slice seems the same as that of `ChunkedArray`
169-
// fn slice(&self, offset: usize, length: usize) -> Self {}
162+
pub fn slice(&self, offset: usize, length: Option<usize>) -> Self {
163+
Self::from_chunked_array(self.data().slice(offset, length), self.field().clone())
164+
}
165+
166+
pub fn null_count(&self) -> usize {
167+
self.data().null_count()
168+
}
169+
170+
pub fn num_rows(&self) -> usize {
171+
self.data().num_rows()
172+
}
170173

171174
fn flatten() {}
172175
}
@@ -178,23 +181,23 @@ pub struct Table {
178181
}
179182

180183
impl Table {
181-
// pub fn new(schema: Arc<Schema>, columns: Vec<Column>) -> Self {
182-
// // assert that there are some columns
183-
// assert!(
184-
// columns.len() > 0,
185-
// "at least one column must be defined to create a record batch"
186-
// );
187-
// // assert that all columns have the same row count
188-
// let len = columns[0].data().len();
189-
// for i in 1..columns.len() {
190-
// assert_eq!(
191-
// len,
192-
// columns[i].len(),
193-
// "all columns in a record batch must have the same length"
194-
// );
195-
// }
196-
// Table { schema, columns }
197-
// }
184+
pub fn new(schema: Arc<Schema>, columns: Vec<Column>) -> Self {
185+
// assert that there are some columns
186+
assert!(
187+
columns.len() > 0,
188+
"at least one column must be defined to create a record batch"
189+
);
190+
// assert that all columns have the same row count
191+
let len = columns[0].data().num_rows();
192+
for i in 1..columns.len() {
193+
assert_eq!(
194+
len,
195+
columns[i].data().num_rows(),
196+
"all columns in a record batch must have the same length"
197+
);
198+
}
199+
Table { schema, columns }
200+
}
198201

199202
pub fn schema(&self) -> &Arc<Schema> {
200203
&self.schema
@@ -251,6 +254,19 @@ impl Table {
251254
Table { schema, columns }
252255
}
253256

257+
/// Slice the table from an offset
258+
pub fn slice(&self, offset: usize, limit: usize) -> Self {
259+
Table {
260+
schema: self.schema.clone(),
261+
columns: self
262+
.columns
263+
.clone()
264+
.into_iter()
265+
.map(|col| col.slice(offset, Some(limit)))
266+
.collect(),
267+
}
268+
}
269+
254270
/// Construct a `Table` from a sequence of Arrow `RecordBatch`es.
255271
///
256272
/// Columns are first created from the `RecordBatch`es, with schema validations being performed.

0 commit comments

Comments (0)