This repository was archived by the owner on Dec 29, 2021. It is now read-only.

Commit 69e7e92 (1 parent: 02c565a)

add ability to read from PostgreSQL table

File tree: 5 files changed, +273 −2 lines

Cargo.toml

Lines changed: 3 additions & 1 deletion
@@ -12,4 +12,6 @@ num-traits = "0.2"
 csv = "1"
 byteorder = "1"
 flatbuffers = "0.5"
-array_tool = "1"
+array_tool = "1"
+postgres = {version = "0.16.0-rc.1", features = ["default", "with-chrono-0_4", "with-uuid-0_7"]}
+chrono = "0.4"

README.md

Lines changed: 10 additions & 0 deletions
@@ -46,6 +46,16 @@ To that end, we're trying to support CSV, JSON, and perhaps other simpler file f
 - [ ] Feather
   - [X] Read
   - [X] Write (**do not use**, the current limitation with slicing arrays means we write each record batch as a file, instead of a single file for all the data)
+- [ ] Arrow IPC
+  - [ ] Read File
+  - [ ] Write File
+- [ ] SQL
+  - [ ] PostgreSQL
+    - [X] Read (ongoing, reading of most columns possible)
+    - [ ] Write
+  - [ ] MSSQL (using tiberius)
+    - [ ] Read
+    - [ ] Write
 
 ### Functionality

src/dataframe.rs

Lines changed: 80 additions & 1 deletion
@@ -350,6 +350,27 @@ impl DataFrame {
         }
     }
 
+    /// Create a DataFrame from a SQL table
+    ///
+    /// Note: Only PostgreSQL is currently supported, and data is buffered in memory before the dataframe is created.
+    /// This might be undesirable when reading large tables. However, note that this library currently performs
+    /// eager evaluation, so the DataFrame would still be created and held in memory. We will improve this with
+    /// a better execution model in the future.
+    pub fn from_sql(connection_string: &str, table_name: &str) -> Self {
+        let batches =
+            crate::io::postgres::read_table(connection_string, table_name, 0, 1024).unwrap();
+        if batches.is_empty() {
+            DataFrame::empty()
+        } else {
+            let schema = batches.get(0).unwrap().schema().clone();
+            let table = crate::table::Table::from_record_batches(schema.clone(), batches);
+            DataFrame {
+                schema,
+                columns: table.columns,
+            }
+        }
+    }
+
     /// Write dataframe to a feather file
     ///
     /// Data is currently written as individual batches (as Arrow doesn't yet support slicing).

@@ -392,7 +413,7 @@ mod tests {
     use crate::dataframe::DataFrame;
     use crate::functions::scalar::ScalarFunctions;
     use crate::table::*;
-    use arrow::array::{Array, ArrayRef, Float64Array, PrimitiveArray};
+    use arrow::array::*;
     use arrow::datatypes::{DataType, Field, Float64Type};
     use std::sync::Arc;
 
@@ -412,6 +433,64 @@ mod tests {
         assert_eq!(37, dataframe.num_rows());
     }
 
+    #[test]
+    fn read_postgres_table_to_dataframe() {
+        // table created with:
+        // create table arrow_data
+        // (
+        //     int32 integer,
+        //     bool boolean,
+        //     int64 bigint,
+        //     string varchar(255),
+        //     timestamp timestamp,
+        //     time time
+        // );
+        let dataframe = DataFrame::from_sql(
+            "postgres://postgres:password@localhost:5432/postgres",
+            "public.arrow_data",
+        );
+        assert_eq!(6, dataframe.num_columns());
+        assert_eq!(1, dataframe.num_rows());
+
+        let col_1 = dataframe.column(0);
+        let col_2 = dataframe.column(1);
+        let col_3 = dataframe.column(2);
+        let col_4 = dataframe.column(3);
+        let col_5 = dataframe.column(4);
+        let col_6 = dataframe.column(5);
+        assert_eq!(
+            "PrimitiveArray<Int32>\n[\n 1,\n]",
+            format!("{:?}", Int32Array::from(col_1.data().chunks()[0].data()))
+        );
+        assert_eq!(
+            "PrimitiveArray<Boolean>\n[\n true,\n]",
+            format!("{:?}", BooleanArray::from(col_2.data().chunks()[0].data()))
+        );
+        assert_eq!(
+            "PrimitiveArray<Int64>\n[\n 12345676354674,\n]",
+            format!("{:?}", Int64Array::from(col_3.data().chunks()[0].data()))
+        );
+        assert_eq!(
+            "lorem ipsum",
+            std::str::from_utf8(BinaryArray::from(col_4.data().chunks()[0].data()).value(0))
+                .unwrap()
+        );
+        assert_eq!(
+            "PrimitiveArray<Timestamp(Millisecond)>\n[\n 2019-04-19T10:41:13.591,\n]",
+            format!(
+                "{:?}",
+                TimestampMillisecondArray::from(col_5.data().chunks()[0].data())
+            )
+        );
+        assert_eq!(
+            "PrimitiveArray<Time64(Microsecond)>\n[\n 12:45:00,\n]",
+            format!(
+                "{:?}",
+                Time64MicrosecondArray::from(col_6.data().chunks()[0].data())
+            )
+        );
+    }
+
     #[test]
     fn dataframe_ops() {
         let mut dataframe = DataFrame::from_csv("./test/data/uk_cities_with_headers.csv", None);
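
For orientation, a minimal sketch of how the new reader is driven end to end, mirroring the test above (the wrapper function is illustrative; it assumes the same local PostgreSQL instance and `arrow_data` table, and note that `from_sql` unwraps internally, so a failed connection or query panics rather than returning an error):

    use crate::dataframe::DataFrame;

    fn print_table_shape() {
        // Read the whole table; from_sql fetches rows in batches of 1024 internally.
        let df = DataFrame::from_sql(
            "postgres://postgres:password@localhost:5432/postgres",
            "public.arrow_data",
        );
        println!("{} columns x {} rows", df.num_columns(), df.num_rows());
    }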

src/io/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 pub mod feather;
 pub mod feather_generated;
+pub mod postgres;

src/io/postgres.rs

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
+//! An experimental interface for reading and writing record batches to and from PostgreSQL
+
+use arrow::builder::*;
+use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use arrow::record_batch::RecordBatch;
+use chrono::Timelike;
+use postgres::types::*;
+use postgres::{Client, NoTls, Row};
+
+fn pg_to_arrow_type(dt: &Type) -> Option<DataType> {
+    match dt {
+        &Type::BOOL => Some(DataType::Boolean),
+        &Type::BYTEA | &Type::CHAR | &Type::NAME | &Type::TEXT | &Type::VARCHAR => {
+            Some(DataType::Utf8)
+        }
+        &Type::INT8 => Some(DataType::Int64),
+        &Type::INT2 => Some(DataType::Int16),
+        &Type::INT4 => Some(DataType::Int32),
+        // &OID => None,
+        // &JSON => None,
+        &Type::FLOAT4 => Some(DataType::Float32),
+        &Type::FLOAT8 => Some(DataType::Float64),
+        // &ABSTIME => None,
+        // &RELTIME => None,
+        // &TINTERVAL => None,
+        // &MONEY => None,
+        &Type::BOOL_ARRAY => Some(DataType::List(Box::new(DataType::Boolean))),
+        &Type::BYTEA_ARRAY | &Type::CHAR_ARRAY | &Type::NAME_ARRAY => {
+            Some(DataType::List(Box::new(DataType::Utf8)))
+        }
+        // &INT2_ARRAY => None,
+        // &INT2_VECTOR => None,
+        // &INT2_VECTOR_ARRAY => None,
+        // &INT4_ARRAY => None,
+        // &TEXT_ARRAY => None,
+        // &INT8_ARRAY => None,
+        // &FLOAT4_ARRAY => None,
+        // &FLOAT8_ARRAY => None,
+        // &ABSTIME_ARRAY => None,
+        // &RELTIME_ARRAY => None,
+        // &TINTERVAL_ARRAY => None,
+        // &DATE => None,
+        &Type::TIME => Some(DataType::Time64(TimeUnit::Microsecond)),
+        &Type::TIMESTAMP => Some(DataType::Timestamp(TimeUnit::Millisecond)),
+        // &TIMESTAMP_ARRAY => None,
+        // &DATE_ARRAY => None,
+        // &TIME_ARRAY => None,
+        // &TIMESTAMPTZ => None,
+        // &TIMESTAMPTZ_ARRAY => None,
+        // &INTERVAL => None,
+        // &INTERVAL_ARRAY => None,
+        // &NUMERIC_ARRAY => None,
+        // &TIMETZ => None,
+        // &BIT => None,
+        // &BIT_ARRAY => None,
+        // &VARBIT => None,
+        // &NUMERIC => None,
+        // &UUID => None,
+        t @ _ => panic!("Postgres type {:?} not supported", t),
+    }
+}
+
+fn from_field(f: &Field, capacity: usize) -> Box<ArrayBuilder> {
+    match f.data_type() {
+        DataType::Boolean => Box::new(BooleanBuilder::new(capacity)),
+        DataType::Int8 => Box::new(Int8Builder::new(capacity)),
+        DataType::Int16 => Box::new(Int16Builder::new(capacity)),
+        DataType::Int32 => Box::new(Int32Builder::new(capacity)),
+        DataType::Int64 => Box::new(Int64Builder::new(capacity)),
+        DataType::UInt8 => Box::new(UInt8Builder::new(capacity)),
+        DataType::UInt16 => Box::new(UInt16Builder::new(capacity)),
+        DataType::UInt32 => Box::new(UInt32Builder::new(capacity)),
+        DataType::UInt64 => Box::new(UInt64Builder::new(capacity)),
+        DataType::Float32 => Box::new(Float32Builder::new(capacity)),
+        DataType::Float64 => Box::new(Float64Builder::new(capacity)),
+        DataType::Utf8 => Box::new(BinaryBuilder::new(capacity)),
+        t @ _ => panic!("Data type {:?} is not currently supported", t),
+    }
+}
+
+// TODO can make this a common trait for DB sources
+pub fn read_table(
+    connection_string: &str,
+    table_name: &str,
+    limit: usize,
+    batch_size: usize,
+) -> Result<Vec<RecordBatch>, ()> {
+    // create connection
+    let mut client = Client::connect(connection_string, NoTls).unwrap();
+    let results = client
+        .query(format!("SELECT * FROM {}", table_name).as_str(), &[])
+        .unwrap();
+    if results.is_empty() {
+        return Ok(vec![]);
+    }
+    let schema = row_to_schema(results.get(0).unwrap()).unwrap();
+    let field_len = schema.fields().len();
+    let mut builder = StructBuilder::from_schema(schema.clone(), batch_size);
+    let chunks = results.chunks(batch_size);
+    let mut batches = vec![];
+    chunks.for_each(|chunk: &[Row]| {
+        for j in 0..field_len {
+            match schema.field(j).data_type() {
+                DataType::Int32 => {
+                    let field_builder = builder.field_builder::<Int32Builder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_value(row.get(j)).unwrap();
+                    }
+                }
+                DataType::Int64 => {
+                    let field_builder = builder.field_builder::<Int64Builder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_value(row.get(j)).unwrap();
+                    }
+                }
+                DataType::Timestamp(TimeUnit::Millisecond) => {
+                    let field_builder = builder
+                        .field_builder::<TimestampMillisecondBuilder>(j)
+                        .unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        let timestamp: chrono::NaiveDateTime = row.get(j);
+                        field_builder
+                            .append_value(timestamp.timestamp_millis())
+                            .unwrap();
+                    }
+                }
+                DataType::Time64(TimeUnit::Microsecond) => {
+                    let field_builder = builder
+                        .field_builder::<Time64MicrosecondBuilder>(j)
+                        .unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        let time: chrono::NaiveTime = row.get(j);
+                        field_builder
+                            .append_value(
+                                time.num_seconds_from_midnight() as i64 * 1000000
+                                    + time.nanosecond() as i64 / 1000,
+                            )
+                            .unwrap();
+                    }
+                }
+                DataType::Boolean => {
+                    let field_builder = builder.field_builder::<BooleanBuilder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_value(row.get(j)).unwrap();
+                    }
+                }
+                DataType::Utf8 => {
+                    let field_builder = builder.field_builder::<BinaryBuilder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_string(row.get(j)).unwrap();
+                    }
+                }
+                t @ _ => panic!("Field builder for {:?} not yet supported", t),
+            }
+        }
+        builder.append(true).unwrap();
+        batches.push(builder.finish().flatten());
+    });
+    Ok(batches)
+}
+
+fn populate_builder() {}
+
+fn row_to_schema(row: &postgres::Row) -> Result<Schema, ()> {
+    let fields = row
+        .columns()
+        .iter()
+        .map(|col: &postgres::Column| {
+            Field::new(col.name(), pg_to_arrow_type(col.type_()).unwrap(), true)
+        })
+        .collect();
+    Ok(Schema::new(fields))
+}
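
When the record batches are wanted directly rather than a DataFrame, `read_table` can also be called on its own. A minimal sketch (the wrapper function and connection string are illustrative; note that the `limit` argument is accepted but not yet applied by this implementation, and that TIME columns are converted to microseconds since midnight via `num_seconds_from_midnight() * 1000000 + nanosecond() / 1000`):

    use crate::io::postgres::read_table;

    fn print_batches() {
        // Fetch all rows of the table, chunked into RecordBatches of up to 1024 rows.
        let batches = read_table(
            "postgres://postgres:password@localhost:5432/postgres",
            "public.arrow_data",
            0,    // limit: accepted but not yet applied by read_table
            1024, // batch_size: maximum rows per RecordBatch
        )
        .unwrap();
        for batch in &batches {
            println!("{} rows x {} columns", batch.num_rows(), batch.num_columns());
        }
    }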
