This repository was archived by the owner on Dec 29, 2021. It is now read-only.

Commit 69e7e92 (1 parent: 02c565a)

add ability to read from PostgreSQL table

File tree: 5 files changed, +273 −2 lines

Cargo.toml

Lines changed: 3 additions & 1 deletion
@@ -12,4 +12,6 @@ num-traits = "0.2"
 csv = "1"
 byteorder = "1"
 flatbuffers = "0.5"
-array_tool = "1"
+array_tool = "1"
+postgres = {version = "0.16.0-rc.1", features = ["default", "with-chrono-0_4", "with-uuid-0_7"]}
+chrono = "0.4"

README.md

Lines changed: 10 additions & 0 deletions
@@ -46,6 +46,16 @@ To that end, we're trying to support CSV, JSON, and perhaps other simpler file f
 - [ ] Feather
   - [X] Read
   - [X] Write (**do not use**, the current limitation with slicing arrays means we write each record batch as a file, instead of a single file for all the data)
+- [ ] Arrow IPC
+  - [ ] Read File
+  - [ ] Write File
+- [ ] SQL
+  - [ ] PostgreSQL
+    - [X] Read (ongoing, reading of most columns possible)
+    - [ ] Write
+  - [ ] MSSQL (using tiberius)
+    - [ ] Read
+    - [ ] Write
 
 ### Functionality

src/dataframe.rs

Lines changed: 80 additions & 1 deletion
@@ -350,6 +350,27 @@ impl DataFrame {
         }
     }
 
+    /// Create a DataFrame from a SQL table
+    ///
+    /// Note: Only PostgreSQL is currently supported, and data is buffered in memory before the dataframe is created.
+    /// This might be undesirable when reading large tables. However, note that this library currently performs
+    /// eager evaluation, so the DataFrame would still be created and held in memory. We will improve this with
+    /// a better execution model in the future.
+    pub fn from_sql(connection_string: &str, table_name: &str) -> Self {
+        let batches =
+            crate::io::postgres::read_table(connection_string, table_name, 0, 1024).unwrap();
+        if batches.is_empty() {
+            DataFrame::empty()
+        } else {
+            let schema = batches.get(0).unwrap().schema().clone();
+            let table = crate::table::Table::from_record_batches(schema.clone(), batches);
+            DataFrame {
+                schema,
+                columns: table.columns,
+            }
+        }
+    }
+
     /// Write dataframe to a feather file
     ///
     /// Data is currently written as individual batches (as Arrow doesn't yet support slicing).

@@ -392,7 +413,7 @@ mod tests {
     use crate::dataframe::DataFrame;
     use crate::functions::scalar::ScalarFunctions;
     use crate::table::*;
-    use arrow::array::{Array, ArrayRef, Float64Array, PrimitiveArray};
+    use arrow::array::*;
     use arrow::datatypes::{DataType, Field, Float64Type};
     use std::sync::Arc;
 
@@ -412,6 +433,64 @@ mod tests {
         assert_eq!(37, dataframe.num_rows());
     }
 
+    #[test]
+    fn read_postgres_table_to_dataframe() {
+        // table created with:
+        // create table arrow_data
+        // (
+        //     int32 integer,
+        //     bool boolean,
+        //     int64 bigint,
+        //     string varchar(255),
+        //     timestamp timestamp,
+        //     time time
+        // );
+        let dataframe = DataFrame::from_sql(
+            "postgres://postgres:password@localhost:5432/postgres",
+            "public.arrow_data",
+        );
+        assert_eq!(6, dataframe.num_columns());
+        assert_eq!(1, dataframe.num_rows());
+
+        let col_1 = dataframe.column(0);
+        let col_2 = dataframe.column(1);
+        let col_3 = dataframe.column(2);
+        let col_4 = dataframe.column(3);
+        let col_5 = dataframe.column(4);
+        let col_6 = dataframe.column(5);
+        assert_eq!(
+            "PrimitiveArray<Int32>\n[\n 1,\n]",
+            format!("{:?}", Int32Array::from(col_1.data().chunks()[0].data()))
+        );
+        assert_eq!(
+            "PrimitiveArray<Boolean>\n[\n true,\n]",
+            format!("{:?}", BooleanArray::from(col_2.data().chunks()[0].data()))
+        );
+        assert_eq!(
+            "PrimitiveArray<Int64>\n[\n 12345676354674,\n]",
+            format!("{:?}", Int64Array::from(col_3.data().chunks()[0].data()))
+        );
+        assert_eq!(
+            "lorem ipsum",
+            std::str::from_utf8(BinaryArray::from(col_4.data().chunks()[0].data()).value(0))
+                .unwrap()
+        );
+        assert_eq!(
+            "PrimitiveArray<Timestamp(Millisecond)>\n[\n 2019-04-19T10:41:13.591,\n]",
+            format!(
+                "{:?}",
+                TimestampMillisecondArray::from(col_5.data().chunks()[0].data())
+            )
+        );
+        assert_eq!(
+            "PrimitiveArray<Time64(Microsecond)>\n[\n 12:45:00,\n]",
+            format!(
+                "{:?}",
+                Time64MicrosecondArray::from(col_6.data().chunks()[0].data())
+            )
+        );
+    }
+
     #[test]
     fn dataframe_ops() {
         let mut dataframe = DataFrame::from_csv("./test/data/uk_cities_with_headers.csv", None);
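
For orientation, a minimal sketch of how the new reader is driven end to end, mirroring the test above (the wrapper function is illustrative; it assumes the same local PostgreSQL instance and `arrow_data` table, and note that `from_sql` unwraps internally, so a failed connection or query panics rather than returning an error):

    use crate::dataframe::DataFrame;

    fn print_table_shape() {
        // Read the whole table; from_sql fetches rows in batches of 1024 internally.
        let df = DataFrame::from_sql(
            "postgres://postgres:password@localhost:5432/postgres",
            "public.arrow_data",
        );
        println!("{} columns x {} rows", df.num_columns(), df.num_rows());
    }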

src/io/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 pub mod feather;
 pub mod feather_generated;
+pub mod postgres;

src/io/postgres.rs

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
+//! An experimental interface for reading and writing record batches to and from PostgreSQL
+
+use arrow::builder::*;
+use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use arrow::record_batch::RecordBatch;
+use chrono::Timelike;
+use postgres::types::*;
+use postgres::{Client, NoTls, Row};
+
+fn pg_to_arrow_type(dt: &Type) -> Option<DataType> {
+    match dt {
+        &Type::BOOL => Some(DataType::Boolean),
+        &Type::BYTEA | &Type::CHAR | &Type::NAME | &Type::TEXT | &Type::VARCHAR => {
+            Some(DataType::Utf8)
+        }
+        &Type::INT8 => Some(DataType::Int64),
+        &Type::INT2 => Some(DataType::Int16),
+        &Type::INT4 => Some(DataType::Int32),
+        // &OID => None,
+        // &JSON => None,
+        &Type::FLOAT4 => Some(DataType::Float32),
+        &Type::FLOAT8 => Some(DataType::Float64),
+        // &ABSTIME => None,
+        // &RELTIME => None,
+        // &TINTERVAL => None,
+        // &MONEY => None,
+        &Type::BOOL_ARRAY => Some(DataType::List(Box::new(DataType::Boolean))),
+        &Type::BYTEA_ARRAY | &Type::CHAR_ARRAY | &Type::NAME_ARRAY => {
+            Some(DataType::List(Box::new(DataType::Utf8)))
+        }
+        // &INT2_ARRAY => None,
+        // &INT2_VECTOR => None,
+        // &INT2_VECTOR_ARRAY => None,
+        // &INT4_ARRAY => None,
+        // &TEXT_ARRAY => None,
+        // &INT8_ARRAY => None,
+        // &FLOAT4_ARRAY => None,
+        // &FLOAT8_ARRAY => None,
+        // &ABSTIME_ARRAY => None,
+        // &RELTIME_ARRAY => None,
+        // &TINTERVAL_ARRAY => None,
+        // &DATE => None,
+        &Type::TIME => Some(DataType::Time64(TimeUnit::Microsecond)),
+        &Type::TIMESTAMP => Some(DataType::Timestamp(TimeUnit::Millisecond)),
+        // &TIMESTAMP_ARRAY => None,
+        // &DATE_ARRAY => None,
+        // &TIME_ARRAY => None,
+        // &TIMESTAMPTZ => None,
+        // &TIMESTAMPTZ_ARRAY => None,
+        // &INTERVAL => None,
+        // &INTERVAL_ARRAY => None,
+        // &NUMERIC_ARRAY => None,
+        // &TIMETZ => None,
+        // &BIT => None,
+        // &BIT_ARRAY => None,
+        // &VARBIT => None,
+        // &NUMERIC => None,
+        // &UUID => None,
+        t @ _ => panic!("Postgres type {:?} not supported", t),
+    }
+}
+
+fn from_field(f: &Field, capacity: usize) -> Box<ArrayBuilder> {
+    match f.data_type() {
+        DataType::Boolean => Box::new(BooleanBuilder::new(capacity)),
+        DataType::Int8 => Box::new(Int8Builder::new(capacity)),
+        DataType::Int16 => Box::new(Int16Builder::new(capacity)),
+        DataType::Int32 => Box::new(Int32Builder::new(capacity)),
+        DataType::Int64 => Box::new(Int64Builder::new(capacity)),
+        DataType::UInt8 => Box::new(UInt8Builder::new(capacity)),
+        DataType::UInt16 => Box::new(UInt16Builder::new(capacity)),
+        DataType::UInt32 => Box::new(UInt32Builder::new(capacity)),
+        DataType::UInt64 => Box::new(UInt64Builder::new(capacity)),
+        DataType::Float32 => Box::new(Float32Builder::new(capacity)),
+        DataType::Float64 => Box::new(Float64Builder::new(capacity)),
+        DataType::Utf8 => Box::new(BinaryBuilder::new(capacity)),
+        t @ _ => panic!("Data type {:?} is not currently supported", t),
+    }
+}
+
+// TODO can make this a common trait for DB sources
+pub fn read_table(
+    connection_string: &str,
+    table_name: &str,
+    limit: usize,
+    batch_size: usize,
+) -> Result<Vec<RecordBatch>, ()> {
+    // create connection
+    let mut client = Client::connect(connection_string, NoTls).unwrap();
+    let results = client
+        .query(format!("SELECT * FROM {}", table_name).as_str(), &[])
+        .unwrap();
+    if results.is_empty() {
+        return Ok(vec![]);
+    }
+    let schema = row_to_schema(results.get(0).unwrap()).unwrap();
+    let field_len = schema.fields().len();
+    let mut builder = StructBuilder::from_schema(schema.clone(), batch_size);
+    let chunks = results.chunks(batch_size);
+    let mut batches = vec![];
+    chunks.for_each(|chunk: &[Row]| {
+        for j in 0..field_len {
+            match schema.field(j).data_type() {
+                DataType::Int32 => {
+                    let field_builder = builder.field_builder::<Int32Builder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_value(row.get(j)).unwrap();
+                    }
+                }
+                DataType::Int64 => {
+                    let field_builder = builder.field_builder::<Int64Builder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_value(row.get(j)).unwrap();
+                    }
+                }
+                DataType::Timestamp(TimeUnit::Millisecond) => {
+                    let field_builder = builder
+                        .field_builder::<TimestampMillisecondBuilder>(j)
+                        .unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        let timestamp: chrono::NaiveDateTime = row.get(j);
+                        field_builder
+                            .append_value(timestamp.timestamp_millis())
+                            .unwrap();
+                    }
+                }
+                DataType::Time64(TimeUnit::Microsecond) => {
+                    let field_builder = builder
+                        .field_builder::<Time64MicrosecondBuilder>(j)
+                        .unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        let time: chrono::NaiveTime = row.get(j);
+                        field_builder
+                            .append_value(
+                                time.num_seconds_from_midnight() as i64 * 1000000
+                                    + time.nanosecond() as i64 / 1000,
+                            )
+                            .unwrap();
+                    }
+                }
+                DataType::Boolean => {
+                    let field_builder = builder.field_builder::<BooleanBuilder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_value(row.get(j)).unwrap();
+                    }
+                }
+                DataType::Utf8 => {
+                    let field_builder = builder.field_builder::<BinaryBuilder>(j).unwrap();
+                    for i in 0..chunk.len() {
+                        let row: &Row = chunk.get(i).unwrap();
+                        field_builder.append_string(row.get(j)).unwrap();
+                    }
+                }
+                t @ _ => panic!("Field builder for {:?} not yet supported", t),
+            }
+        }
+        builder.append(true).unwrap();
+        batches.push(builder.finish().flatten());
+    });
+    Ok(batches)
+}
+
+fn populate_builder() {}
+
+fn row_to_schema(row: &postgres::Row) -> Result<Schema, ()> {
+    let fields = row
+        .columns()
+        .iter()
+        .map(|col: &postgres::Column| {
+            Field::new(col.name(), pg_to_arrow_type(col.type_()).unwrap(), true)
+        })
+        .collect();
+    Ok(Schema::new(fields))
+}
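
When the record batches are wanted directly rather than a DataFrame, `read_table` can also be called on its own. A minimal sketch (the wrapper function and connection string are illustrative; note that the `limit` argument is accepted but not yet applied by this implementation, and that TIME columns are converted to microseconds since midnight via `num_seconds_from_midnight() * 1000000 + nanosecond() / 1000`):

    use crate::io::postgres::read_table;

    fn print_batches() {
        // Fetch all rows of the table, chunked into RecordBatches of up to 1024 rows.
        let batches = read_table(
            "postgres://postgres:password@localhost:5432/postgres",
            "public.arrow_data",
            0,    // limit: accepted but not yet applied by read_table
            1024, // batch_size: maximum rows per RecordBatch
        )
        .unwrap();
        for batch in &batches {
            println!("{} rows x {} columns", batch.num_rows(), batch.num_columns());
        }
    }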
