Skip to content
This repository was archived by the owner on Dec 29, 2021. It is now read-only.

Commit a438661

Browse files
committed
add ability to slice data
1 parent 5a495b2 commit a438661

File tree

3 files changed

+71
-71
lines changed

3 files changed

+71
-71
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ authors = ["Neville Dipale <nevilledips@gmail.com>"]
55
edition = "2018"
66

77
[dependencies]
8-
arrow = { git = "https://github.com/nevi-me/arrow", rev = "cb38e68"}
8+
arrow = { git = "https://github.com/apache/arrow"}
99
# arrow = { path = "../../arrow/rust/arrow"}
1010
num = "0.2"
1111
num-traits = "0.2"

src/dataframe.rs

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ pub struct DataFrame {
2222
columns: Vec<Column>,
2323
}
2424

25-
2625
impl DataFrame {
2726
/// Create an empty `DataFrame`
2827
fn empty() -> Self {
@@ -143,31 +142,16 @@ impl DataFrame {
143142
}
144143

145144
/// Returns dataframe with the first n records selected
146-
///
147-
/// TODO: this should work through batches, and slice the last one that makes
148-
/// the length match what we're taking.
149-
// fn take(&self, count: usize) -> Self {
150-
// DataFrame::new(
151-
// self.schema.clone(),
152-
// self.columns
153-
// .into_iter()
154-
// .map(|col| {
155-
// ArrayDataBuilder::new(col.data_type().clone())
156-
// .child_data(
157-
// col.data()
158-
// .child_data()
159-
// .iter()
160-
// .take(count)
161-
// .into_iter()
162-
// .map(|x| x.clone())
163-
// .collect(),
164-
// )
165-
// .build()
166-
// })
167-
// .map(|col| utils::make_array(col))
168-
// .collect(),
169-
// )
170-
// }
145+
fn take(&self, count: usize) -> Self {
146+
DataFrame::new(
147+
self.schema.clone(),
148+
self.columns
149+
.clone()
150+
.into_iter()
151+
.map(|col| col.slice(0, Some(count)))
152+
.collect(),
153+
)
154+
}
171155

172156
fn intersect(&self, other: &DataFrame) -> Self {
173157
unimplemented!("Intersect not yet implemented")

src/table.rs

Lines changed: 60 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ pub struct ChunkedArray {
1010
chunks: Vec<Arc<Array>>,
1111
num_rows: usize,
1212
null_count: usize,
13-
// TODO: Go has data_type, is it worth storing, or getting from the first chunk?
1413
}
1514

1615
impl ChunkedArray {
@@ -40,7 +39,7 @@ impl ChunkedArray {
4039
self.num_rows
4140
}
4241

43-
fn null_count(&self) -> usize {
42+
pub fn null_count(&self) -> usize {
4443
self.null_count
4544
}
4645

@@ -63,29 +62,24 @@ impl ChunkedArray {
6362
/// The `offset` is the position of the first element in the constructed slice.
6463
/// `length` is the length of the slice. If there are not enough elements in the chunked array,
6564
/// the length will be adjusted accordingly.
66-
///
67-
/// TODO: I've made length optional because CPP has 2 `slice` methods, with one being a slice
68-
/// to the end of the array.
69-
///
70-
/// TODO: This relies on my version of slice, which I'm still implementing.
71-
// fn slice(&self, offset: usize, length: Option<usize>) -> Self {
72-
// // unimplemented!("TODO: I need help here, this has to be a zero-copy slice among slices")
73-
// let mut offset = offset;
74-
// let mut length = length.unwrap_or(std::usize::MAX);
75-
// let mut current_chunk: usize = 0;
76-
// let mut new_chunks: Vec<ArrayRef> = vec![];
77-
// while current_chunk < self.num_chunks() && offset >= self.chunk(current_chunk).len() {
78-
// offset -= self.chunk(current_chunk).len();
79-
// current_chunk += 1;
80-
// }
81-
// while current_chunk < self.num_chunks() && length > 0 {
82-
// new_chunks.push(self.chunk(current_chunk).slice(offset, length));
83-
// length -= self.chunk(current_chunk).len() - offset;
84-
// offset = 0;
85-
// current_chunk += 1;
86-
// }
87-
// Self::from_arrays(new_chunks)
88-
// }
65+
fn slice(&self, offset: usize, length: Option<usize>) -> Self {
66+
let mut offset = offset;
67+
let mut length = length.unwrap_or(std::usize::MAX);
68+
let mut current_chunk: usize = 0;
69+
let mut new_chunks: Vec<ArrayRef> = vec![];
70+
// compute the first offset. While offset >= the current chunk's length, skip that whole chunk
71+
while current_chunk < self.num_chunks() && offset >= self.chunk(current_chunk).len() {
72+
offset -= self.chunk(current_chunk).len();
73+
current_chunk += 1;
74+
}
75+
while current_chunk < self.num_chunks() && length > 0 {
76+
new_chunks.push(self.chunk(current_chunk).slice(offset, length));
77+
length -= self.chunk(current_chunk).len() - offset;
78+
offset = 0;
79+
current_chunk += 1;
80+
}
81+
Self::from_arrays(new_chunks)
82+
}
8983

9084
fn flatten(&self) {
9185
unimplemented!("This is for flattening struct columns, we aren't yet there")
@@ -165,8 +159,17 @@ impl Column {
165159
&self.field
166160
}
167161

168-
/// TODO: slice seems the same as that of `ChunkedArray`
169-
// fn slice(&self, offset: usize, length: usize) -> Self {}
162+
pub fn slice(&self, offset: usize, length: Option<usize>) -> Self {
163+
Self::from_chunked_array(self.data().slice(offset, length), self.field().clone())
164+
}
165+
166+
pub fn null_count(&self) -> usize {
167+
self.data().null_count()
168+
}
169+
170+
pub fn num_rows(&self) -> usize {
171+
self.data().num_rows()
172+
}
170173

171174
fn flatten() {}
172175
}
@@ -178,23 +181,23 @@ pub struct Table {
178181
}
179182

180183
impl Table {
181-
// pub fn new(schema: Arc<Schema>, columns: Vec<Column>) -> Self {
182-
// // assert that there are some columns
183-
// assert!(
184-
// columns.len() > 0,
185-
// "at least one column must be defined to create a record batch"
186-
// );
187-
// // assert that all columns have the same row count
188-
// let len = columns[0].data().len();
189-
// for i in 1..columns.len() {
190-
// assert_eq!(
191-
// len,
192-
// columns[i].len(),
193-
// "all columns in a record batch must have the same length"
194-
// );
195-
// }
196-
// Table { schema, columns }
197-
// }
184+
pub fn new(schema: Arc<Schema>, columns: Vec<Column>) -> Self {
185+
// assert that there are some columns
186+
assert!(
187+
columns.len() > 0,
188+
"at least one column must be defined to create a record batch"
189+
);
190+
// assert that all columns have the same row count
191+
let len = columns[0].data().num_rows();
192+
for i in 1..columns.len() {
193+
assert_eq!(
194+
len,
195+
columns[i].data().num_rows(),
196+
"all columns in a record batch must have the same length"
197+
);
198+
}
199+
Table { schema, columns }
200+
}
198201

199202
pub fn schema(&self) -> &Arc<Schema> {
200203
&self.schema
@@ -251,6 +254,19 @@ impl Table {
251254
Table { schema, columns }
252255
}
253256

257+
/// Slice the table from an offset
258+
pub fn slice(&self, offset: usize, limit: usize) -> Self {
259+
Table {
260+
schema: self.schema.clone(),
261+
columns: self
262+
.columns
263+
.clone()
264+
.into_iter()
265+
.map(|col| col.slice(offset, Some(limit)))
266+
.collect(),
267+
}
268+
}
269+
254270
/// Construct a `Table` from a sequence of Arrow `RecordBatch`es.
255271
///
256272
/// Columns are first created from the `RecordBatch`es, with schema validations being performed.

0 commit comments

Comments (0)