This repository was archived by the owner on Dec 29, 2021. It is now read-only.

Commit 7e0d187

implement dataframe::to_record_batches
1 parent 5745219 commit 7e0d187

src/dataframe.rs

Lines changed: 39 additions & 5 deletions
@@ -6,6 +6,7 @@ use arrow::array_data::ArrayDataRef;
 use arrow::csv::Reader as CsvReader;
 use arrow::csv::ReaderBuilder as CsvReaderBuilder;
 use arrow::datatypes::*;
+use arrow::error::ArrowError;
 use arrow::record_batch::RecordBatch;
 use std::fs::File;
 use std::sync::Arc;
@@ -112,6 +113,10 @@ impl DataFrame {
         self.columns.len()
     }
 
+    pub fn num_chunks(&self) -> usize {
+        self.column(0).data.num_chunks()
+    }
+
     pub fn num_rows(&self) -> usize {
         self.columns[0].data.num_rows()
     }
@@ -155,12 +160,41 @@ impl DataFrame {
     /// Returns dataframe as an Arrow `RecordBatch`
     /// TODO: add a method to break into smaller batches
     fn to_record_batches(&self) -> Vec<RecordBatch> {
-        let batches: Vec<RecordBatch> = Vec::with_capacity(self.column(0).data().num_chunks());
-        for i in 0..self.num_columns() {
-            unimplemented!("We currently do not get batches, this should live in dataframe")
+        let num_chunks = self.column(0).data().num_chunks();
+        let num_columns = self.num_columns();
+        let mut batches: Vec<RecordBatch> = Vec::with_capacity(num_chunks);
+        let mut arrays: Vec<Vec<ArrayRef>> = Vec::with_capacity(num_chunks);
+        // for i in 0..self.num_columns() {
+        //     let column = self.column(i);
+        //     if i == 0 {
+        //         arrays.push(vec![]);
+        //     }
+        //     for j in 0..column.data().num_chunks() {
+        //         arrays[i].push(column.data().chunk(j).to_owned());
+        //     }
+        // }
+
+        for i in 0..num_chunks {
+            let mut arr = vec![];
+
+            // if i == 0 {
+            //     arrays.push(vec![]);
+            // }
+            for j in 0..num_columns {
+                let column = self.column(j);
+                arr.push(column.data().chunk(i).to_owned());
+            }
+
+            arrays.push(arr);
+            dbg!("pushed array");
         }
+
+        arrays.into_iter().for_each(|array| {
+            dbg!(array.len());
+            batches.push(RecordBatch::new(self.schema.clone(), array));
+        });
+
         batches
-        // RecordBatch::new(self.schema.clone(), self.columns)
     }
 
     /// Returns dataframe with the first n records selected
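The new loop effectively transposes the column-major chunked storage into row-group batches: the i-th chunk of every column is gathered into the i-th RecordBatch. Below is a minimal standalone sketch of that transposition, using plain Vec<Vec<i32>> in place of Arrow chunked arrays and record batches; the names chunks_to_batches, col_a, and col_b are illustrative only and not part of the crate.

// Sketch only: columns[j][i] stands in for "chunk i of column j".
// Assumes every column has the same number of chunks, as to_record_batches does.
fn chunks_to_batches(columns: &[Vec<Vec<i32>>]) -> Vec<Vec<Vec<i32>>> {
    let num_chunks = columns.first().map_or(0, |c| c.len());
    let mut batches = Vec::with_capacity(num_chunks);
    for i in 0..num_chunks {
        // Gather the i-th chunk of every column into one "batch".
        let batch: Vec<Vec<i32>> = columns.iter().map(|col| col[i].clone()).collect();
        batches.push(batch);
    }
    batches
}

fn main() {
    // Two columns, each split into two chunks of three rows.
    let col_a = vec![vec![1, 2, 3], vec![4, 5, 6]];
    let col_b = vec![vec![10, 20, 30], vec![40, 50, 60]];
    let batches = chunks_to_batches(&[col_a, col_b]);
    assert_eq!(batches.len(), 2); // one batch per chunk
    assert_eq!(batches[0], vec![vec![1, 2, 3], vec![10, 20, 30]]);
}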
@@ -300,7 +334,7 @@ impl DataFrame {
                 let builder = CsvReaderBuilder::new()
                     .infer_schema(None)
                     .has_headers(true)
-                    .with_batch_size(6);
+                    .with_batch_size(1024);
                 builder.build(file).unwrap()
             }
         };
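The batch-size bump matters because the CSV reader yields one record batch per read of batch_size rows, so the dataframe's columns presumably end up chunked at that granularity and to_record_batches emits one RecordBatch per chunk. A rough sketch of the relationship, assuming every chunk except possibly the last is full; expected_chunks is a hypothetical helper, not part of the crate.

// Hypothetical helper: expected number of chunks (and hence RecordBatches)
// for a CSV with `num_rows` rows read with the given `batch_size`.
fn expected_chunks(num_rows: usize, batch_size: usize) -> usize {
    (num_rows + batch_size - 1) / batch_size // ceiling division
}

fn main() {
    // With the old batch size of 6, a 10_000-row file would yield 1_667 chunks;
    // with 1024 it yields 10.
    assert_eq!(expected_chunks(10_000, 6), 1_667);
    assert_eq!(expected_chunks(10_000, 1024), 10);
}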
