DarkStarStrix
diff --git a/‎.idea/DataStream.iml
Lines changed: 15 additions & 0 deletions b/‎.idea/DataStream.iml
Lines changed: 15 additions & 0 deletions
diff --git a/‎.idea/modules.xml
Lines changed: 8 additions & 0 deletions b/‎.idea/modules.xml
Lines changed: 8 additions & 0 deletions
diff --git a/‎EDA/EDA.py
Lines changed: 10 additions & 13 deletions b/‎EDA/EDA.py
Lines changed: 10 additions & 13 deletions
diff --git a/‎Example/Basic_Pipeline_Performance.py
Lines changed: 0 additions & 68 deletions b/‎Example/Basic_Pipeline_Performance.py
Lines changed: 0 additions & 68 deletions
diff --git a/‎Example/Data_pipeline.py
Lines changed: 54 additions & 0 deletions b/‎Example/Data_pipeline.py
Lines changed: 54 additions & 0 deletions
@@ -1,40 +1,37 @@
-# eda.py
+# EDA/EDA.py
 
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 
 
 class EDA:
-    def __init__(self, file_path):
-        self.data = pd.read_csv (file_path)
+    def __init__(self, data):  # data: an already-loaded pandas DataFrame (no file I/O here)
+        self.data = data
 
-    def summary_statistics(self):
+    def get_summary_statistics(self):
         return self.data.describe ()
 
-    def missing_values(self):
+    def get_missing_values(self):
         return self.data.isnull ().sum ()
 
     def plot_histogram(self, column):
-        plt.figure (figsize=(10, 6))
-        sns.histplot (self.data [column], kde=True)
+        self.data [column].hist ()
         plt.title (f'Histogram of {column}')
         plt.show ()
 
     def plot_correlation_matrix(self):
-        plt.figure (figsize=(12, 8))
-        sns.heatmap (self.data.corr (), annot=True, cmap='coolwarm')
+        corr = self.data.corr (numeric_only=True)  # non-numeric columns make corr() raise on pandas >= 2.0
+        sns.heatmap (corr, annot=True, cmap='coolwarm')
         plt.title ('Correlation Matrix')
         plt.show ()
 
     def plot_scatter(self, column1, column2):
-        plt.figure (figsize=(10, 6))
-        sns.scatterplot (x=self.data [column1], y=self.data [column2])
-        plt.title (f'Scatter Plot of {column1} vs {column2}')
+        self.data.plot.scatter (x=column1, y=column2)
+        plt.title (f'Scatter plot of {column1} vs {column2}')
         plt.show ()
 
     def plot_missing_values(self):
-        plt.figure (figsize=(12, 6))
         sns.heatmap (self.data.isnull (), cbar=False, cmap='viridis')
         plt.title ('Missing Values Heatmap')
         plt.show ()
@@ -0,0 +1,54 @@
+import os
+import pandas as pd
+from Loaders.csv_loader import CSVLoader
+from preprocess.Cleaning import DataCleaner
+from tabulate import tabulate
+
+# Input and output locations. NOTE(review): absolute, machine-specific paths - edit before running elsewhere
+input_file_path = 'C:/Users/kunya/PycharmProjects/DataStream/data/customers-100.csv'
+output_dir = 'C:/Users/kunya/PycharmProjects/DataStream/data/processed_data'
+output_file_path = os.path.join (output_dir, 'processed_customers-100.csv')
+
+# Create the output directory (no error if it already exists) before anything is written to it
+os.makedirs (output_dir, exist_ok=True)
+
+
+# Function to process a single file
+def process_file(input_file, output_file):
+    """Clean one CSV file and save the cleaned data plus a summary report.
+
+    Loads `input_file` with CSVLoader, applies DataCleaner with the 'fill'
+    missing-value strategy, drops columns whose values are all zero, and
+    writes the result to `output_file`. A 'summary_statistics.txt' report is
+    written next to `output_file`, whose directory must already exist.
+    """
+    loader = CSVLoader (file_path=input_file)
+    data = loader.load_data ()
+
+    print ("Original Dataset:")
+    print (data)
+
+    cleaner = DataCleaner (missing_value_strategy='fill')
+    cleaned_data = cleaner.transform (data)
+
+    # Keep only columns that contain at least one non-zero value
+    cleaned_data = cleaned_data.loc [:, (cleaned_data != 0).any (axis=0)]
+
+    # One row per column; include='all' also covers non-numeric columns
+    summary_stats = cleaned_data.describe (include='all').transpose ()
+
+    # Write the report beside output_file instead of relying on a global path
+    stats_path = os.path.join (os.path.dirname (output_file), 'summary_statistics.txt')
+    with open (stats_path, 'w', encoding='utf-8') as f:
+        f.write ("Summary Statistics:\n")
+        f.write (tabulate (summary_stats, headers='keys', tablefmt='grid'))
+
+    print ("Cleaned Data:")
+    print (cleaned_data)
+    cleaned_data.to_csv (output_file, index=False)
+
+
+# Run the pipeline only when executed as a script, so importing this module does not process files
+if __name__ == '__main__':
+    process_file (input_file_path, output_file_path)
+    print ("Data processing complete. Processed file and summary statistics are saved in the 'processed_data' folder.")