prakharrathi25 · SourceryAI · Jun 17, 2022 · SourceryAI · Jun 17, 2022 · SourceryAI
diff --git a/pages/data_upload.py b/pages/data_upload.py
@@ -20,33 +20,33 @@ def app():
         except Exception as e:
             print(e)
             data = pd.read_excel(uploaded_file)
-    
-    
-    
+
+
+
     # uploaded_files = st.file_uploader("Upload your CSV file here.", type='csv', accept_multiple_files=False)
     # # Check if file exists 
     # if uploaded_files:
     #     for file in uploaded_files:
     #         file.seek(0)
     #     uploaded_data_read = [pd.read_csv(file) for file in uploaded_files]
     #     raw_data = pd.concat(uploaded_data_read)
-    
+
     # uploaded_files = st.file_uploader("Upload CSV", type="csv", accept_multiple_files=False)
     # print(uploaded_files, type(uploaded_files))
     # if uploaded_files:
     #     for file in uploaded_files:
     #         file.seek(0)
     #     uploaded_data_read = [pd.read_csv(file) for file in uploaded_files]
     #     raw_data = pd.concat(uploaded_data_read)
-    
+
     # read temp data 
     # data = pd.read_csv('data/2015.csv')
 
 
     ''' Load the data and save the columns with categories as a dataframe. 
     This section also allows changes in the numerical and categorical columns. '''
     if st.button("Load Data"):
-        
+
         # Raw data 
         st.dataframe(data)
         #utils.getProfile(data)
@@ -55,7 +55,7 @@ def app():
         #source_code = HtmlFile.read() 
         #components.iframe("data/output.html")# Save the data to a new file 
         data.to_csv('data/main_data.csv', index=False)
-        
+
         #Generate a pandas profiling report
         #if st.button("Generate an analysis report"):
         #    utils.getProfile(data)
@@ -64,16 +64,16 @@ def app():
         # 	pass
 
         # Collect the categorical and numerical columns 
-        
+
         numeric_cols = data.select_dtypes(include=np.number).columns.tolist()
         categorical_cols = list(set(list(data.columns)) - set(numeric_cols))
-        
+
         # Save the columns as a dataframe or dictionary
         columns = []
 
         # Iterate through the numerical and categorical columns and save in columns 
         columns = utils.genMetaData(data) 
-        
+
         # Save the columns as a dataframe with categories
         # Here column_name is the name of the field and the type is whether it's numerical or categorical
         columns_df = pd.DataFrame(columns, columns = ['column_name', 'type'])
@@ -83,6 +83,6 @@ def app():
         st.markdown("**Column Name**-**Type**")
         for i in range(columns_df.shape[0]):
             st.write(f"{i+1}. **{columns_df.iloc[i]['column_name']}** - {columns_df.iloc[i]['type']}")
-        
+
         st.markdown("""The above are the automated column types detected by the application in the data. 
         In case you wish to change the column types, head over to the **Column Change** section. """)
diff --git a/pages/data_visualize.py b/pages/data_visualize.py
@@ -22,8 +22,8 @@ def app():
         for i in range(len(Categorical)):
                 unique_Category_val = {Categorical[i]: utils.mapunique(df_analysis, Categorical[i])}
                 cat_groups = {Categorical[i]: df_visual.groupby(Categorical[i])}
-                
-        
+
+
         category = st.selectbox("Select Category ", Categorical + Object)
 
         sizes = (df_visual[category].value_counts()/df_visual[category].count())
@@ -34,15 +34,15 @@ def app():
         explode = [0]*len(labels)
         explode[int(maxIndex)] = 0.1
         explode = tuple(explode)
-        
+
         fig1, ax1 = plt.subplots()
         ax1.pie(sizes,explode = explode, labels=labels, autopct='%1.1f%%',shadow=False, startangle=0)
         ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
-        ax1.set_title('Distribution for Categorical Column - ' + (str)(category))
+        ax1.set_title(f'Distribution for Categorical Column - {category}')
         st.pyplot(fig1)
-        
+
         corr = df_analysis.corr(method='pearson')
-        
+
         fig2, ax2 = plt.subplots()
         mask = np.zeros_like(corr, dtype=np.bool)
         mask[np.triu_indices_from(mask)] = True
@@ -51,9 +51,12 @@ def app():
         sns.heatmap(corr, mask=mask, linewidths=.5, cmap=cmap, center=0,ax=ax2)
         ax2.set_title("Correlation Matrix")
         st.pyplot(fig2)
-
-
-        categoryObject=st.selectbox("Select " + (str)(category),unique_Category_val[category])
+
+
+        categoryObject = st.selectbox(
+            f"Select {category}", unique_Category_val[category]
+        )
+
         st.write(cat_groups[category].get_group(categoryObject).describe())
         colName = st.selectbox("Select Column ",Numerical)
 

diff --git a/pages/machine_learning.py b/pages/machine_learning.py
@@ -20,7 +20,7 @@ def app():
     by the user. It runs some basic models and let's the user select the X and y variables. 
     """
 
-    # Load the data 
+    # Load the data
     if 'main_data.csv' not in os.listdir('data'):
         st.markdown("Please upload data through `Upload Data` page!")
     else:
@@ -62,28 +62,28 @@ def app():
 
         st.write(f"**Variable to be predicted:** {y_var}")
         st.write(f"**Variable to be used for prediction:** {X_var}")
-        
+
         # Divide the data into test and train set 
         X = data[X_var]
         y = data[y_var]
 
         # Perform data imputation 
         # st.write("THIS IS WHERE DATA IMPUTATION WILL HAPPEN")
-        
+
         # Perform encoding
         X = pd.get_dummies(X)
 
         # Check if y needs to be encoded
         if not isNumerical(y):
             le = LabelEncoder()
             y = le.fit_transform(y)
-            
+
             # Print all the classes 
             st.write("The classes and the class allotted to them is the following:-")
             classes = list(le.classes_)
             for i in range(len(classes)):
                 st.write(f"{classes[i]} --> {i}")
-        
+
 
         # Perform train test splits 
         st.markdown("#### Train Test Splitting")
@@ -106,15 +106,11 @@ def app():
         if pred_type == "Regression":
             st.write("Running Regression Models on Sample")
 
-            # Table to store model and accurcy 
-            model_r2 = []
-
             # Linear regression model 
             lr_model = LinearRegression()
             lr_model.fit(X_train, y_train)
             lr_r2 = lr_model.score(X_test, y_test)
-            model_r2.append(['Linear Regression', lr_r2])
-
+            model_r2 = [['Linear Regression', lr_r2]]
             # Decision Tree model 
             dt_model = DecisionTreeRegressor()
             dt_model.fit(X_train, y_train)
@@ -131,19 +127,15 @@ def app():
             # Make a dataframe of results 
             results = pd.DataFrame(model_r2, columns=['Models', 'R2 Score']).sort_values(by='R2 Score', ascending=False)
             st.dataframe(results)
-        
+
         if pred_type == "Classification":
             st.write("Running Classfication Models on Sample")
 
-            # Table to store model and accurcy 
-            model_acc = []
-
             # Linear regression model 
             lc_model = LogisticRegression()
             lc_model.fit(X_train, y_train)
             lc_acc = lc_model.score(X_test, y_test)
-            model_acc.append(['Linear Regression', lc_acc])
-
+            model_acc = [['Logistic Regression', lc_acc]]
             # Decision Tree model 
             dtc_model = DecisionTreeClassifier()
             dtc_model.fit(X_train, y_train)

diff --git a/pages/metadata.py b/pages/metadata.py
@@ -23,32 +23,32 @@ def app():
             The page is divided into two columns using beta columns 
         '''
         st.markdown("#### Change the information about column types")
-        
+
         # Use two column technique 
         col1, col2 = st.beta_columns(2)
 
         global name, type
         # Design column 1 
         name = col1.selectbox("Select Column", data.columns)
-        
+
         # Design column two 
         current_type = col_metadata[col_metadata['column_name'] == name]['type'].values[0]
         print(current_type)
         column_options = ['numeric', 'categorical','object']
         current_index = column_options.index(current_type)
-        
+
         type = col2.selectbox("Select Column Type", options=column_options, index = current_index)
-        
+
         st.write("""Select your column name and the new type from the data.
                     To submit all the changes, click on *Submit changes* """)
 
-        
+
         if st.button("Change Column Type"): 
 
             # Set the value in the metadata and resave the file 
             # col_metadata = pd.read_csv('data/metadata/column_type_desc.csv')
             st.dataframe(col_metadata[col_metadata['column_name'] == name])
-            
+
             col_metadata.loc[col_metadata['column_name'] == name, 'type'] = type
             col_metadata.to_csv('data/metadata/column_type_desc.csv', index = False)
 

diff --git a/pages/modelUtils.py b/pages/modelUtils.py
@@ -4,29 +4,28 @@
 from sklearn import linear_model
 
 def checkDir():
-	if 'models' in os.listdir('../'): 
-		return True
-	return False
+	return 'models' in os.listdir('../')
 
 def makeDir():
-	if checkDir(): 
-		pass
-	else: 
+	if not checkDir():
 		os.mkdir('../models')
 
 # will save a model at ../models and will return the location+name of saved model
 def saveModel(modelClass, name = None):
 	fileName = name
-	if name is None: 
+	if fileName is None: 
 		fileName = 'model'+str(len(os.listdir('../models')))
 	fileName+='.sav'
-	pickle.dump(modelClass, open('../models/'+fileName, 'wb'))
-	return '../models/'+fileName
+	modelPath = f'../models/{fileName}'
+	# Context manager closes the handle even if pickling fails.
+	with open(modelPath, 'wb') as modelFile:
+		pickle.dump(modelClass, modelFile)
+	return modelPath
 
 # model will be loaded through the location of model that is returned from the 
 def loadModel(fileName):
-	model = pickle.load(open(fileName, 'rb'))
-	return model
+	with open(fileName, 'rb') as modelFile:
+		return pickle.load(modelFile)
 
 ### All the below tests passed
 if __name__ == '__main__':
@@ -36,6 +35,6 @@ def loadModel(fileName):
 	reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
 	print("og Coeff: ",reg.coef_)
 	path = saveModel(reg)
-	print("Model Name: "+path)
+	print(f"Model Name: {path}")
 	model = loadModel(path)
 	print("Loaded Model:", model.coef_)
diff --git a/pages/redundant.py b/pages/redundant.py
@@ -5,7 +5,7 @@
 import os
 
 def app():
-	
+
 	if 'main_data.csv' not in os.listdir('data'):
 		st.markdown("Please upload data through `Upload Data` page!")
 	else:

diff --git a/pages/utils.py b/pages/utils.py
@@ -39,22 +39,20 @@ def genMetaData(df):
         if isCategorical(df[col[i]]):
             ColumnType.append((col[i],"categorical"))
             Categorical.append(col[i])
-        
+
         elif is_numeric_dtype(df[col[i]]):
             ColumnType.append((col[i],"numerical"))
             Numerical.append(col[i])
-        
+
         else:
             ColumnType.append((col[i],"object"))
             Object.append(col[i])
 
     return ColumnType
 
 def makeMapDict(col): 
-    uniqueVals = list(np.unique(col))
-    uniqueVals.sort()
-    dict_ = {uniqueVals[i]: i for i in range(len(uniqueVals))}
-    return dict_
+    uniqueVals = sorted(np.unique(col))
+    return {value: index for index, value in enumerate(uniqueVals)}
 
 def mapunique(df, colName):
     dict_ = makeMapDict(df[colName])
@@ -66,16 +64,10 @@ def mapunique(df, colName):
 ## For redundant columns
 def getRedundentColumns(corr, y: str, threshold =0.1): 
     cols = corr.columns
-    redunt = []
-    k = 0
-    for ind, c in enumerate(corr[y]):
-        if c<1-threshold: 
-            redunt.append(cols[ind])
-    return redunt
+    return [cols[ind] for ind, c in enumerate(corr[y]) if c<1-threshold]
 
 def newDF(df, columns2Drop):
-    newDF = df.drop(columns2Drop, axis = 'columns')
-    return newDF
+    return df.drop(columns2Drop, axis = 'columns')
 
 if __name__ == '__main__':
     df = {"Name": ["salil", "saxena", "for", "int"]}