Skip to content

Commit cfabbfb

Browse files
committed
Example Datapipeline
Signed-off-by: Darkstalker <allanw.mk@gmail.com>
1 parent 483bdc3 commit cfabbfb

File tree

9 files changed

+293
-87
lines changed

9 files changed

+293
-87
lines changed

.idea/DataStream.iml

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

EDA/EDA.py

Lines changed: 10 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,40 +1,37 @@
1-
# eda.py
1+
# EDA/EDA.py
22

33
import pandas as pd
44
import matplotlib.pyplot as plt
55
import seaborn as sns
66

77

88
class EDA:
    """Exploratory-data-analysis helpers bound to a single dataset.

    The instance keeps a reference to the dataset it is given and exposes
    summary tables and quick-look plots for it. The methods call pandas
    DataFrame APIs (``describe``, ``isnull``, ``corr``, ``plot.scatter``),
    so ``data`` is expected to be a ``pandas.DataFrame``.
    """

    def __init__(self, data):
        """Store the dataset to analyse.

        Args:
            data: The already-loaded DataFrame; it is stored as-is (no copy).
        """
        self.data = data

    def get_summary_statistics(self):
        """Return ``self.data.describe()``."""
        return self.data.describe()

    def get_missing_values(self):
        """Return the number of null entries per column."""
        return self.data.isnull().sum()

    def plot_histogram(self, column):
        """Draw and show a histogram of one column.

        Args:
            column: Label of the column to plot.
        """
        self.data[column].hist()
        plt.title(f'Histogram of {column}')
        plt.show()

    def plot_correlation_matrix(self):
        """Draw and show an annotated heatmap of pairwise column correlations.

        Only numeric and boolean columns take part. Since pandas 2.0,
        ``DataFrame.corr()`` no longer drops text columns on its own and
        raises instead, so the frame is narrowed explicitly first.
        """
        numeric_data = self.data.select_dtypes(include=['number', 'bool'])
        corr = numeric_data.corr()
        sns.heatmap(corr, annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix')
        plt.show()

    def plot_scatter(self, column1, column2):
        """Draw and show a scatter plot of two columns.

        Args:
            column1: Label of the column plotted on the x axis.
            column2: Label of the column plotted on the y axis.
        """
        self.data.plot.scatter(x=column1, y=column2)
        plt.title(f'Scatter plot of {column1} vs {column2}')
        plt.show()

    def plot_missing_values(self):
        """Draw and show a heatmap of the dataset's null mask."""
        sns.heatmap(self.data.isnull(), cbar=False, cmap='viridis')
        plt.title('Missing Values Heatmap')
        plt.show()

Example/Basic_Pipeline_Performance.py

Lines changed: 0 additions & 68 deletions
This file was deleted.

Example/Data_pipeline.py

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,54 @@
1+
import os
2+
import pandas as pd
3+
from Loaders.csv_loader import CSVLoader
4+
from preprocess.Cleaning import DataCleaner
5+
from tabulate import tabulate
6+
7+
# Define file paths relative to the project root instead of one user's
# absolute home directory, so the example runs from any checkout.
# NOTE(review): assumes this script lives one level below the project root
# (Example/Data_pipeline.py) next to a sibling data/ folder - confirm.
_project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
input_file_path = os.path.join(_project_root, 'data', 'customers-100.csv')
output_dir = os.path.join(_project_root, 'data', 'processed_data')
output_file_path = os.path.join(output_dir, 'processed_customers-100.csv')

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
14+
15+
16+
# Function to process a single file
17+
def process_file(input_file, output_file):
    """Load a CSV, clean it, and save the cleaned data plus summary statistics.

    The original and the cleaned datasets are printed to the console. A
    ``summary_statistics.txt`` report is written to the same directory as
    ``output_file``.

    Args:
        input_file: Path of the CSV file passed to ``CSVLoader``.
        output_file: Path the cleaned data is written to as CSV, without
            the index column.
    """
    # Derive the destination folder from the argument rather than from a
    # module-level variable, so the function works for any output location.
    destination_dir = os.path.dirname(output_file)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)
    summary_path = os.path.join(destination_dir, 'summary_statistics.txt')

    # Load data
    loader = CSVLoader(file_path=input_file)
    data = loader.load_data()

    # Print the original dataset
    print("Original Dataset:")
    print(data)

    # Clean the data.
    # NOTE(review): 'fill' presumably fills missing values; verify in DataCleaner.
    cleaner = DataCleaner(missing_value_strategy='fill')
    cleaned_data = cleaner.transform(data)

    # Keep only columns that contain at least one value different from 0
    has_nonzero = (cleaned_data != 0).any(axis=0)
    cleaned_data = cleaned_data.loc[:, has_nonzero]

    # Generate summary statistics for all columns, one row per column
    summary_stats = cleaned_data.describe(include='all').transpose()

    # Save summary statistics to a text file; explicit UTF-8 avoids
    # platform-default encoding errors on non-ASCII cell values.
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("Summary Statistics:\n")
        f.write(tabulate(summary_stats, headers='keys', tablefmt='grid'))

    # Print the processed data to the console
    print("Cleaned Data:")
    print(cleaned_data)

    # Save the cleaned data
    cleaned_data.to_csv(output_file, index=False)
49+
50+
51+
# Run the pipeline on the configured input and output locations
process_file(input_file=input_file_path, output_file=output_file_path)

# Tell the user where the results were written
print(
    "Data processing complete. Processed file and summary statistics are saved in the 'processed_data' folder."
)

0 commit comments

Comments
 (0)