Skip to content

Commit 7c013dd

Browse files
Volodymyr OrlovVolodymyr Orlov
authored and committed
feat: refactors dataset
1 parent d28f13d commit 7c013dd

File tree

10 files changed

+145
-6617
lines changed

10 files changed

+145
-6617
lines changed

src/dataset/boston.rs

Lines changed: 17 additions & 534 deletions
Large diffs are not rendered by default.

src/dataset/boston.xy

27.7 KB
Binary file not shown.

src/dataset/breast_cancer.rs

Lines changed: 18 additions & 1579 deletions
Large diffs are not rendered by default.

src/dataset/breast_cancer.xy

68.9 KB
Binary file not shown.

src/dataset/diabetes.rs

Lines changed: 17 additions & 4456 deletions
Large diffs are not rendered by default.

src/dataset/diabetes.xy

19 KB
Binary file not shown.

src/dataset/iris.rs

Lines changed: 16 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -15,61 +15,21 @@
1515
//! | Petal width | Numerical | No |
1616
//! | Class | Nominal | Yes |
1717
//!
18+
use crate::dataset::deserialize_data;
1819
use crate::dataset::Dataset;
1920

2021
/// Get dataset
2122
pub fn load_dataset() -> Dataset<f32, f32> {
22-
let x = vec![
23-
5.1, 3.5, 1.4, 0.2, 4.9, 3.0, 1.4, 0.2, 4.7, 3.2, 1.3, 0.2, 4.6, 3.1, 1.5, 0.2, 5.0, 3.6,
24-
1.4, 0.2, 5.4, 3.9, 1.7, 0.4, 4.6, 3.4, 1.4, 0.3, 5.0, 3.4, 1.5, 0.2, 4.4, 2.9, 1.4, 0.2,
25-
4.9, 3.1, 1.5, 0.1, 5.4, 3.7, 1.5, 0.2, 4.8, 3.4, 1.6, 0.2, 4.8, 3.0, 1.4, 0.1, 4.3, 3.0,
26-
1.1, 0.1, 5.8, 4.0, 1.2, 0.2, 5.7, 4.4, 1.5, 0.4, 5.4, 3.9, 1.3, 0.4, 5.1, 3.5, 1.4, 0.3,
27-
5.7, 3.8, 1.7, 0.3, 5.1, 3.8, 1.5, 0.3, 5.4, 3.4, 1.7, 0.2, 5.1, 3.7, 1.5, 0.4, 4.6, 3.6,
28-
1.0, 0.2, 5.1, 3.3, 1.7, 0.5, 4.8, 3.4, 1.9, 0.2, 5.0, 3.0, 1.6, 0.2, 5.0, 3.4, 1.6, 0.4,
29-
5.2, 3.5, 1.5, 0.2, 5.2, 3.4, 1.4, 0.2, 4.7, 3.2, 1.6, 0.2, 4.8, 3.1, 1.6, 0.2, 5.4, 3.4,
30-
1.5, 0.4, 5.2, 4.1, 1.5, 0.1, 5.5, 4.2, 1.4, 0.2, 4.9, 3.1, 1.5, 0.1, 5.0, 3.2, 1.2, 0.2,
31-
5.5, 3.5, 1.3, 0.2, 4.9, 3.1, 1.5, 0.1, 4.4, 3.0, 1.3, 0.2, 5.1, 3.4, 1.5, 0.2, 5.0, 3.5,
32-
1.3, 0.3, 4.5, 2.3, 1.3, 0.3, 4.4, 3.2, 1.3, 0.2, 5.0, 3.5, 1.6, 0.6, 5.1, 3.8, 1.9, 0.4,
33-
4.8, 3.0, 1.4, 0.3, 5.1, 3.8, 1.6, 0.2, 4.6, 3.2, 1.4, 0.2, 5.3, 3.7, 1.5, 0.2, 5.0, 3.3,
34-
1.4, 0.2, 7.0, 3.2, 4.7, 1.4, 6.4, 3.2, 4.5, 1.5, 6.9, 3.1, 4.9, 1.5, 5.5, 2.3, 4.0, 1.3,
35-
6.5, 2.8, 4.6, 1.5, 5.7, 2.8, 4.5, 1.3, 6.3, 3.3, 4.7, 1.6, 4.9, 2.4, 3.3, 1.0, 6.6, 2.9,
36-
4.6, 1.3, 5.2, 2.7, 3.9, 1.4, 5.0, 2.0, 3.5, 1.0, 5.9, 3.0, 4.2, 1.5, 6.0, 2.2, 4.0, 1.0,
37-
6.1, 2.9, 4.7, 1.4, 5.6, 2.9, 3.6, 1.3, 6.7, 3.1, 4.4, 1.4, 5.6, 3.0, 4.5, 1.5, 5.8, 2.7,
38-
4.1, 1.0, 6.2, 2.2, 4.5, 1.5, 5.6, 2.5, 3.9, 1.1, 5.9, 3.2, 4.8, 1.8, 6.1, 2.8, 4.0, 1.3,
39-
6.3, 2.5, 4.9, 1.5, 6.1, 2.8, 4.7, 1.2, 6.4, 2.9, 4.3, 1.3, 6.6, 3.0, 4.4, 1.4, 6.8, 2.8,
40-
4.8, 1.4, 6.7, 3.0, 5.0, 1.7, 6.0, 2.9, 4.5, 1.5, 5.7, 2.6, 3.5, 1.0, 5.5, 2.4, 3.8, 1.1,
41-
5.5, 2.4, 3.7, 1.0, 5.8, 2.7, 3.9, 1.2, 6.0, 2.7, 5.1, 1.6, 5.4, 3.0, 4.5, 1.5, 6.0, 3.4,
42-
4.5, 1.6, 6.7, 3.1, 4.7, 1.5, 6.3, 2.3, 4.4, 1.3, 5.6, 3.0, 4.1, 1.3, 5.5, 2.5, 4.0, 1.3,
43-
5.5, 2.6, 4.4, 1.2, 6.1, 3.0, 4.6, 1.4, 5.8, 2.6, 4.0, 1.2, 5.0, 2.3, 3.3, 1.0, 5.6, 2.7,
44-
4.2, 1.3, 5.7, 3.0, 4.2, 1.2, 5.7, 2.9, 4.2, 1.3, 6.2, 2.9, 4.3, 1.3, 5.1, 2.5, 3.0, 1.1,
45-
5.7, 2.8, 4.1, 1.3, 6.3, 3.3, 6.0, 2.5, 5.8, 2.7, 5.1, 1.9, 7.1, 3.0, 5.9, 2.1, 6.3, 2.9,
46-
5.6, 1.8, 6.5, 3.0, 5.8, 2.2, 7.6, 3.0, 6.6, 2.1, 4.9, 2.5, 4.5, 1.7, 7.3, 2.9, 6.3, 1.8,
47-
6.7, 2.5, 5.8, 1.8, 7.2, 3.6, 6.1, 2.5, 6.5, 3.2, 5.1, 2.0, 6.4, 2.7, 5.3, 1.9, 6.8, 3.0,
48-
5.5, 2.1, 5.7, 2.5, 5.0, 2.0, 5.8, 2.8, 5.1, 2.4, 6.4, 3.2, 5.3, 2.3, 6.5, 3.0, 5.5, 1.8,
49-
7.7, 3.8, 6.7, 2.2, 7.7, 2.6, 6.9, 2.3, 6.0, 2.2, 5.0, 1.5, 6.9, 3.2, 5.7, 2.3, 5.6, 2.8,
50-
4.9, 2.0, 7.7, 2.8, 6.7, 2.0, 6.3, 2.7, 4.9, 1.8, 6.7, 3.3, 5.7, 2.1, 7.2, 3.2, 6.0, 1.8,
51-
6.2, 2.8, 4.8, 1.8, 6.1, 3.0, 4.9, 1.8, 6.4, 2.8, 5.6, 2.1, 7.2, 3.0, 5.8, 1.6, 7.4, 2.8,
52-
6.1, 1.9, 7.9, 3.8, 6.4, 2.0, 6.4, 2.8, 5.6, 2.2, 6.3, 2.8, 5.1, 1.5, 6.1, 2.6, 5.6, 1.4,
53-
7.7, 3.0, 6.1, 2.3, 6.3, 3.4, 5.6, 2.4, 6.4, 3.1, 5.5, 1.8, 6.0, 3.0, 4.8, 1.8, 6.9, 3.1,
54-
5.4, 2.1, 6.7, 3.1, 5.6, 2.4, 6.9, 3.1, 5.1, 2.3, 5.8, 2.7, 5.1, 1.9, 6.8, 3.2, 5.9, 2.3,
55-
6.7, 3.3, 5.7, 2.5, 6.7, 3.0, 5.2, 2.3, 6.3, 2.5, 5.0, 1.9, 6.5, 3.0, 5.2, 2.0, 6.2, 3.4,
56-
5.4, 2.3, 5.9, 3.0, 5.1, 1.8,
57-
];
58-
59-
let setosa = std::iter::repeat(0f32).take(50);
60-
let versicolor = std::iter::repeat(1f32).take(50);
61-
let virginica = std::iter::repeat(2f32).take(50);
62-
let y = setosa
63-
.chain(versicolor)
64-
.chain(virginica)
65-
.collect::<Vec<f32>>();
66-
let shape = (150, 4);
23+
let (x, y, num_samples, num_features) = match deserialize_data(std::include_bytes!("iris.xy")) {
24+
Err(why) => panic!("Can't deserialize iris.xy. {}", why),
25+
Ok((x, y, num_samples, num_features)) => (x, y, num_samples, num_features),
26+
};
6727

6828
Dataset {
6929
data: x,
7030
target: y,
71-
num_samples: shape.0,
72-
num_features: shape.1,
31+
num_samples: num_samples,
32+
num_features: num_features,
7333
feature_names: vec![
7434
"sepal length (cm)",
7535
"sepal width (cm)",
@@ -90,8 +50,17 @@ pub fn load_dataset() -> Dataset<f32, f32> {
9050
#[cfg(test)]
9151
mod tests {
9252

53+
use super::super::*;
9354
use super::*;
9455

56+
// Maintenance helper: writes the currently loaded iris dataset back out in the
// binary `.xy` layout. It is `#[ignore]`d, so it only runs when requested explicitly.
#[test]
57+
#[ignore]
58+
fn refresh_iris_dataset() {
59+
// Run this test manually to regenerate the iris.xy file.
// NOTE(review): `load_dataset` itself decodes the embedded iris.xy, so this
// round-trips the existing file rather than rebuilding it from raw values.
// NOTE(review): the relative path "iris.xy" is handed directly to
// `serialize_data`, so the output location depends on the working directory
// the test runs in; confirm it ends up next to this source file.
60+
let dataset = load_dataset();
61+
assert!(serialize_data(&dataset, "iris.xy").is_ok());
62+
}
63+
9564
#[test]
9665
fn iris_dataset() {
9766
let dataset = load_dataset();

src/dataset/iris.xy

2.95 KB
Binary file not shown.

src/dataset/mod.rs

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
//! Datasets
22
//!
33
//! In this module you will find small datasets that are used in SmartCore for demonstration purpose mostly.
4-
54
pub mod boston;
65
pub mod breast_cancer;
76
pub mod diabetes;
87
pub mod iris;
98

9+
use crate::math::num::RealNumber;
10+
use std::fs::File;
11+
use std::io;
12+
use std::io::prelude::*;
13+
1014
/// Dataset
1115
pub struct Dataset<X, Y> {
1216
/// data in one-dimensional array.
@@ -42,6 +46,67 @@ impl<X, Y> Dataset<X, Y> {
4246
}
4347
}
4448

49+
#[allow(dead_code)]
50+
pub(crate) fn serialize_data<X: RealNumber, Y: RealNumber>(
51+
dataset: &Dataset<X, Y>,
52+
filename: &str,
53+
) -> Result<(), io::Error> {
54+
match File::create(filename) {
55+
Ok(mut file) => {
56+
file.write(&dataset.num_features.to_le_bytes())?;
57+
file.write(&dataset.num_samples.to_le_bytes())?;
58+
let x: Vec<u8> = dataset
59+
.data
60+
.iter()
61+
.map(|v| *v)
62+
.flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec().into_iter())
63+
.collect();
64+
file.write_all(&x)?;
65+
let y: Vec<u8> = dataset
66+
.target
67+
.iter()
68+
.map(|v| *v)
69+
.flat_map(|f| f.to_f32_bits().to_le_bytes().to_vec().into_iter())
70+
.collect();
71+
file.write_all(&y)?;
72+
}
73+
Err(why) => panic!("couldn't create {}: {}", filename, why),
74+
}
75+
Ok(())
76+
}
77+
78+
/// Decodes a dataset from the binary layout produced by `serialize_data`.
///
/// Expected layout (everything little-endian):
/// 1. number of features as an 8-byte unsigned integer,
/// 2. number of samples as an 8-byte unsigned integer,
/// 3. `num_samples * num_features` feature values, 4 bytes of `f32` bits each,
/// 4. `num_samples` target values, 4 bytes of `f32` bits each.
///
/// Bytes after the last target value are ignored.
///
/// Returns `(x, y, num_samples, num_features)`.
///
/// # Errors
///
/// * `io::ErrorKind::UnexpectedEof` if `bytes` is shorter than the header or
///   than the payload the header announces.
/// * `io::ErrorKind::InvalidData` if the announced dimensions do not fit in
///   `usize` or overflow when multiplied.
pub(crate) fn deserialize_data(
    bytes: &[u8],
) -> Result<(Vec<f32>, Vec<f32>, usize, usize), io::Error> {
    // Width of each header count and of each stored value, in bytes.
    const COUNT_SIZE: usize = 8;
    const VALUE_SIZE: usize = 4;

    fn truncated() -> io::Error {
        io::Error::new(io::ErrorKind::UnexpectedEof, "dataset bytes are truncated")
    }
    fn corrupt() -> io::Error {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "dataset dimensions are out of range",
        )
    }

    // Header counts are always 8 bytes on disk; decode them as `u64` and then
    // narrow, so this also builds on targets where `usize` is 4 bytes wide.
    let read_count = |offset: usize| -> Result<usize, io::Error> {
        let field = bytes
            .get(offset..offset + COUNT_SIZE)
            .ok_or_else(truncated)?;
        let mut buffer = [0u8; COUNT_SIZE];
        buffer.copy_from_slice(field);
        <usize as std::convert::TryFrom<u64>>::try_from(u64::from_le_bytes(buffer))
            .map_err(|_| corrupt())
    };
    let num_features = read_count(0)?;
    let num_samples = read_count(COUNT_SIZE)?;

    // Validate the announced sizes against the real input before allocating,
    // so a damaged header yields an error instead of a panic or a huge allocation.
    let x_len = num_samples.checked_mul(num_features).ok_or_else(corrupt)?;
    let payload_len = x_len
        .checked_add(num_samples)
        .and_then(|count| count.checked_mul(VALUE_SIZE))
        .ok_or_else(corrupt)?;
    let payload = bytes
        .get(2 * COUNT_SIZE..)
        .and_then(|rest| rest.get(..payload_len))
        .ok_or_else(truncated)?;

    // A single pass over the payload: the first `x_len` values are features and
    // the remaining `num_samples` values are targets, so the read position
    // advances for every value of both sections.
    let mut values = payload.chunks_exact(VALUE_SIZE).map(|chunk| {
        let mut buffer = [0u8; VALUE_SIZE];
        buffer.copy_from_slice(chunk);
        f32::from_bits(u32::from_le_bytes(buffer))
    });
    let x: Vec<f32> = values.by_ref().take(x_len).collect();
    let y: Vec<f32> = values.collect();

    Ok((x, y, num_samples, num_features))
}
109+
45110
#[cfg(test)]
46111
mod tests {
47112
use super::*;

src/math/num.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ pub trait RealNumber: Float + FromPrimitive + Debug + Display + Copy + Sum + Pro
3232
fn square(self) -> Self {
3333
self * self
3434
}
35+
36+
/// Bit pattern of the value as a 32-bit float, returned as a `u32` (not a `u64`).
/// `serialize_data` writes this value out as 4 little-endian bytes.
37+
fn to_f32_bits(self) -> u32;
3538
}
3639

3740
impl RealNumber for f64 {
@@ -69,6 +72,10 @@ impl RealNumber for f64 {
6972
fn half() -> Self {
7073
0.5f64
7174
}
75+
76+
fn to_f32_bits(self) -> u32 {
77+
self.to_bits() as u32
78+
}
7279
}
7380

7481
impl RealNumber for f32 {
@@ -106,6 +113,10 @@ impl RealNumber for f32 {
106113
fn half() -> Self {
107114
0.5f32
108115
}
116+
117+
fn to_f32_bits(self) -> u32 {
118+
self.to_bits()
119+
}
109120
}
110121

111122
#[cfg(test)]

0 commit comments

Comments
 (0)