This repository was archived by the owner on Dec 29, 2021. It is now read-only.

Commit 02c565a

Merge pull request #12 from nevi-me/slices

Ability to slice table, column, chunked array

2 parents: c3980af + a438661

File tree: 6 files changed (+219 −203 lines)
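
The headline change is the ability to slice a table, column, or chunked array. As a rough sketch of the API shape (hypothetical usage, inferred from the `col.slice(0, Some(count))` call in the src/dataframe.rs diff below; the `Table` and `ChunkedArray` forms are not shown in this section and are assumed to mirror `Column`):

```rust
// Hypothetical sketch, inferred from the dataframe.rs hunk below.
// `slice(offset, length)` takes an optional length; `None` is assumed
// to mean "to the end". Arrow slices are offset/length views over the
// same buffers, so no data should be copied.
let df = DataFrame::from_csv("./test/data/uk_cities_with_headers.csv", None);
let lat = df.column_by_name("lat");
let first_ten = lat.slice(0, Some(10)); // rows [0, 10)
```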

Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ authors = ["Neville Dipale <nevilledips@gmail.com>"]
 edition = "2018"
 
 [dependencies]
-arrow = { git = "https://github.com/nevi-me/arrow", branch="arrow-4386"}
+arrow = { git = "https://github.com/apache/arrow"}
 # arrow = { path = "../../arrow/rust/arrow"}
 num = "0.2"
 num-traits = "0.2"

README.md

Lines changed: 3 additions & 5 deletions

@@ -37,9 +37,9 @@ To that end, we're trying to support CSV, JSON, and perhaps other simpler file f
 **Note on Feather:** The Feather file format support can be considered deprecated in favour of Arrow IPC. Though we have implemented Feather, it's meant to be a stop-gap measure until Arrow supports IPC (in Rust). We'll try to tackle this in the coming months.
 
 - IO Support
-  - [ ] CSV
+  - [X] CSV (using Arrow)
     - [X] Read
-    - [ ] Write
+    - [X] Write
   - [ ] JSON
     - [X] Read (submitted to Arrow)
     - [ ] Write
@@ -50,13 +50,11 @@ To that end, we're trying to support CSV, JSON, and perhaps other simpler file f
 ### Functionality
 
 - DataFrame Operations
-  <!-- - [x] Read CSV into dataframe -->
   - [X] Select single column
-  - [ ] Select subset of columns, drop columns
+  - [X] Select subset of columns, drop columns
   - [X] Add or remove columns
   - [X] Rename columns
   - [ ] Create dataframe from record batches (a `Vec<RecordBatch>` as well as an iterator)
-  - [ ] Write dataframe to CSV (and other formats as and when Arrow supports them)
   - [ ] Sort dataframes
   - [ ] Grouped operations
  - [ ] Filter dataframes

src/dataframe.rs

Lines changed: 142 additions & 151 deletions

@@ -17,50 +17,11 @@ use std::sync::Arc;
 
 use crate::error::DataFrameError;
 
-//impl From<&ArrayRef> for &PrimitiveArray<BooleanType> {
-//    fn from(array: &ArrayRef) -> Self {
-//        array.as_any().downcast_ref::<BooleanArray>().unwrap()
-//    }
-//}
-
-//impl<T: ArrowPrimitiveType> From<&Array> for &PrimitiveArray<T> {
-//    fn from(array: &Array) -> Self {
-//        match array.data_type() {
-//            DataType::Boolean => array.as_any().downcast_ref::<T>().unwrap()
-//        }
-////        _ => unimplemented!("Casting array to other primitive types is not implemented")
-//    }
-//}
-
-//fn array_to_primitive<T>(array: &Array) -> &PrimitiveArray<T>
-//    where
-//    T: ArrowPrimitiveType,
-//{
-//    match array.data_type() {
-//        DataType::Boolean => {
-//            array.as_any().downcast_ref::<BooleanArray>().unwrap()
-//        }
-//        _ => unimplemented!("Casting for other array types is not implemented")
-//    }
-//}
-
 pub struct DataFrame {
     schema: Arc<Schema>,
     columns: Vec<Column>,
 }
 
-// struct CsvDataSource {
-//     reader: CsvReader,
-// }
-
-// impl Iterator for CsvDataSource {
-//     type Item = Result<RecordBatch, DataFrameError>;
-
-//     fn next(&mut self) -> Result<Option<Self::Item>, arrow::error::ArrowError> {
-//         Some(Ok(self.reader.next()))
-//     }
-// }
-
 impl DataFrame {
     /// Create an empty `DataFrame`
     fn empty() -> Self {
@@ -173,38 +134,24 @@
 
         arrays.into_iter().for_each(|array| {
             dbg!(array.len());
-            batches.push(RecordBatch::new(self.schema.clone(), array));
+            // the unwrap is infallible as we're passing data that's already been verified
+            batches.push(RecordBatch::try_new(self.schema.clone(), array).unwrap());
         });
 
         batches
     }
 
     /// Returns dataframe with the first n records selected
-    ///
-    /// TODO: this should work through batches, and slice the last one that makes
-    /// the length match what we're taking.
-    // fn take(&self, count: usize) -> Self {
-    //     DataFrame::new(
-    //         self.schema.clone(),
-    //         self.columns
-    //             .into_iter()
-    //             .map(|col| {
-    //                 ArrayDataBuilder::new(col.data_type().clone())
-    //                     .child_data(
-    //                         col.data()
-    //                             .child_data()
-    //                             .iter()
-    //                             .take(count)
-    //                             .into_iter()
-    //                             .map(|x| x.clone())
-    //                             .collect(),
-    //                     )
-    //                     .build()
-    //             })
-    //             .map(|col| utils::make_array(col))
-    //             .collect(),
-    //     )
-    // }
+    fn take(&self, count: usize) -> Self {
+        DataFrame::new(
+            self.schema.clone(),
+            self.columns
+                .clone()
+                .into_iter()
+                .map(|col| col.slice(0, Some(count)))
+                .collect(),
+        )
+    }
 
     fn intersect(&self, other: &DataFrame) -> Self {
         unimplemented!("Intersect not yet implemented")
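
The new `take` is now a thin wrapper over column slicing. A minimal sketch of its effect (crate-internal, since `take` is not `pub` in this commit; the data path is the one used by the tests below):

```rust
// Minimal sketch: `take(n)` returns a DataFrame whose columns are
// slices (offset 0, length n) of the originals, so the first n rows
// are selected without copying the underlying Arrow buffers.
let df = DataFrame::from_csv("./test/data/uk_cities_with_headers.csv", None);
let head = df.take(5);
```
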
@@ -213,100 +160,96 @@
     /// Returns dataframe with specified columns selected.
     ///
     /// If a column name does not exist, it is omitted.
-    // pub fn select(&mut self, col_names: Vec<&str>) -> Self {
-    //     // get the names of columns from the schema, and match them with supplied
-    //     let mut col_num: i16 = -1;
-    //     let schema = &self.schema.clone();
-    //     let field_names: Vec<(usize, &str)> = schema
-    //         .fields()
-    //         .iter()
-    //         .map(|c| {
-    //             col_num += 1;
-    //             (col_num as usize, c.name().as_str())
-    //         })
-    //         .collect();
-
-    //     // filter names
-    //     let filter_cols: Vec<(usize, &str)> = if col_names.contains(&"*") {
-    //         field_names
-    //     } else {
-    //         // TODO follow the order of user-supplied column names
-    //         field_names
-    //             .into_iter()
-    //             .filter(|(col, name)| col_names.contains(name))
-    //             .collect()
-    //     };
-
-    //     // let columns = filter_cols.clone().iter().map(move |c| self.columns[c.0]).collect();
-
-    //     let mut columns = vec![];
-
-    //     for (i,u) in filter_cols.clone() {
-    //         let c = &self.columns[i];
-    //         columns.push(c);
-    //     }
-
-    //     let new_schema = Arc::new(Schema::new(
-    //         filter_cols
-    //             .iter()
-    //             .map(|c| schema.field(c.0).clone())
-    //             .collect(),
-    //     ));
-
-    //     dbg!(filter_cols);
-
-    //     DataFrame::from_columns(new_schema, columns)
-    // }
+    pub fn select(&mut self, col_names: Vec<&str>) -> Self {
+        // get the names of columns from the schema, and match them with supplied
+        let mut col_num: i16 = -1;
+        let schema = &self.schema.clone();
+        let field_names: Vec<(usize, &str)> = schema
+            .fields()
+            .iter()
+            .map(|c| {
+                col_num += 1;
+                (col_num as usize, c.name().as_str())
+            })
+            .collect();
+
+        // filter names
+        let filter_cols: Vec<(usize, &str)> = if col_names.contains(&"*") {
+            field_names
+        } else {
+            // TODO follow the order of user-supplied column names
+            field_names
+                .into_iter()
+                .filter(|(_, name)| col_names.contains(name))
+                .collect()
+        };
+
+        let mut columns = vec![];
+
+        for (i, _) in filter_cols.clone() {
+            let c = &self.columns[i];
+            columns.push(c.clone());
+        }
+
+        let new_schema = Arc::new(Schema::new(
+            filter_cols
+                .iter()
+                .map(|c| schema.field(c.0).clone())
+                .collect(),
+        ));
+
+        DataFrame::from_columns(new_schema, columns)
+    }
 
     /// Returns a dataframe with specified columns dropped.
     ///
     /// If a column name does not exist, it is omitted.
-    // pub fn drop(&self, col_names: Vec<&str>) -> Self {
-    //     // get the names of columns from the schema, and match them with supplied
-    //     let mut col_num: i16 = -1;
-    //     let schema = self.schema.clone();
-    //     let field_names: Vec<(usize, &str)> = schema
-    //         .fields()
-    //         .into_iter()
-    //         .map(|c| {
-    //             col_num += 1;
-    //             (col_num as usize, c.name().as_str())
-    //         })
-    //         .collect();
-
-    //     // filter names
-    //     let filter_cols: Vec<(usize, &str)> = {
-    //         // TODO follow the order of user-supplied column names
-    //         field_names
-    //             .into_iter()
-    //             .filter(|(col, name)| !col_names.contains(name))
-    //             .collect()
-    //     };
-
-    //     // construct dataframe with selected columns
-    //     DataFrame {
-    //         schema: Arc::new(Schema::new(
-    //             filter_cols
-    //                 .iter()
-    //                 .map(|c| schema.field(c.0).clone())
-    //                 .collect(),
-    //         )),
-    //         columns: filter_cols
-    //             .into_iter()
-    //             .map(move |c| self.columns[c.0])
-    //             .collect(),
-    //     }
-    // }
+    pub fn drop(&self, col_names: Vec<&str>) -> Self {
+        // get the names of columns from the schema, and match them with supplied
+        let mut col_num: i16 = -1;
+        let schema = self.schema.clone();
+        let field_names: Vec<(usize, &str)> = schema
+            .fields()
+            .into_iter()
+            .map(|c| {
+                col_num += 1;
+                (col_num as usize, c.name().as_str())
+            })
+            .collect();
+
+        // filter names
+        let filter_cols: Vec<(usize, &str)> = {
+            // TODO follow the order of user-supplied column names
+            field_names
+                .into_iter()
+                .filter(|(_, name)| !col_names.contains(name))
+                .collect()
+        };
+
+        // construct dataframe with selected columns
+        DataFrame {
+            schema: Arc::new(Schema::new(
+                filter_cols
+                    .iter()
+                    .map(|c| schema.field(c.0).clone())
+                    .collect(),
+            )),
+            columns: filter_cols
+                .iter()
+                .map(|c| self.columns[c.0].clone())
+                .collect(),
+        }
+    }
 
     /// Create a dataframe from an Arrow Table.
     ///
     /// Arrow Tables are not yet in the Rust library, and we are hashing them out here
-    // pub fn from_table(table: crate::table::Table) -> Self {
-    //     DataFrame {
-    //         schema: table.schema().clone(),
-    //         columns: *table.columns(),
-    //     }
-    // }
+    pub fn from_table(table: crate::table::Table) -> Self {
+        DataFrame {
+            schema: table.schema().clone(),
+            columns: table.columns().to_vec(),
+        }
+    }
 
     pub fn from_csv(path: &str, schema: Option<Arc<Schema>>) -> Self {
         let file = File::open(path).unwrap();
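
With `select` and `drop` now live rather than commented out, basic column projection looks like this. A hedged sketch, with column names taken from the uk_cities test data used elsewhere in this commit:

```rust
// `select` keeps the named columns ("*" keeps everything); note the
// TODO above: results follow schema order, not the caller's order.
let mut df = DataFrame::from_csv("./test/data/uk_cities_with_headers.csv", None);
let coords = df.select(vec!["lat", "lng"]);

// `drop` removes columns by name; unknown names are simply ignored.
let without_city = df.drop(vec!["city"]);
```
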
@@ -428,6 +371,21 @@
         Ok(())
     }
 
+    pub fn to_csv(&self, path: &str) -> Result<(), arrow::error::ArrowError> {
+        // use csv::error::Error;
+        use arrow::csv::Writer;
+
+        let file = File::create(path)?;
+
+        let mut wrt = Writer::new(file);
+
+        let batches = self.to_record_batches();
+        let batches_ref: Vec<&RecordBatch> = batches.iter().collect();
+
+        wrt.write(batches_ref)?;
+
+        Ok(())
+    }
 }
 
 mod tests {
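
A round-trip sketch for the new `to_csv`, mirroring the `csv_io` test below (the output path is illustrative):

```rust
// Read the bundled test CSV and write it back out through arrow's
// csv::Writer, which to_csv drives via to_record_batches().
let df = DataFrame::from_csv("./test/data/uk_cities_with_headers.csv", None);
df.to_csv("/tmp/uk_cities_roundtrip.csv").expect("CSV write failed");
```
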
@@ -536,4 +494,37 @@
         let write = dataframe.to_feather("./test/data/uk_cities");
         assert!(write.is_ok());
     }
+
+    #[test]
+    fn csv_io() {
+        let mut dataframe = DataFrame::from_csv("./test/data/uk_cities_with_headers.csv", None);
+        let a = dataframe.column_by_name("lat");
+        let b = dataframe.column_by_name("lng");
+        let sum = ScalarFunctions::add(column_to_arrays_f64(a), column_to_arrays_f64(b));
+        // TODO, make this better
+        let sum: Vec<ArrayRef> = sum
+            .unwrap()
+            .into_iter()
+            .map(|p| Arc::new(p) as ArrayRef)
+            .collect();
+        dataframe = dataframe.with_column(
+            "lat_lng_sum",
+            Column::from_arrays(sum, Field::new("lat_lng_sum", DataType::Float64, true)),
+        );
+
+        let city = dataframe.column_by_name("city");
+        let lowercase = ScalarFunctions::lower(column_to_arrays_str(city));
+        let lowercase: Vec<ArrayRef> = lowercase
+            .unwrap()
+            .into_iter()
+            .map(|p| Arc::new(p) as ArrayRef)
+            .collect();
+        dataframe = dataframe.with_column(
+            "city_lower",
+            Column::from_arrays(lowercase, Field::new("city_lower", DataType::Utf8, true)),
+        );
+
+        let write = dataframe.to_csv("/tmp/uk_cities_out.csv");
+        assert!(write.is_ok());
+    }
 }

src/functions/scalar.rs

Lines changed: 10 additions & 1 deletion

@@ -247,7 +247,16 @@ impl ScalarFunctions {
     pub fn greatest() {}
     pub fn hash() {}
     pub fn hex() {}
-    pub fn hour() {}
+    pub fn hour<T>(array: Vec<&PrimitiveArray<T>>) -> Result<Vec<Int32Array>, ArrowError>
+    where
+        T: ArrowNumericType + ArrowTemporalType,
+        i64: std::convert::From<T::Native>,
+    {
+        array
+            .iter()
+            .map(|a| compute::hour(a))
+            .collect()
+    }
     pub fn hypot<T>(
         a: &PrimitiveArray<T>,
         b: &PrimitiveArray<T>,
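
For the new `hour` kernel, a hedged usage sketch: it assumes arrow's `Date64Array` (milliseconds since the Unix epoch) satisfies the `ArrowNumericType + ArrowTemporalType` bounds that the underlying `compute::hour` kernel requires:

```rust
// Hedged sketch: 1_545_750_000_000 ms is 2018-12-25T15:00:00 UTC,
// so the extracted hour should be 15.
use arrow::array::Date64Array;

let dates = Date64Array::from(vec![1_545_750_000_000]);
let hours = ScalarFunctions::hour(vec![&dates]).unwrap();
assert_eq!(15, hours[0].value(0));
```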
