diff --git a/pages/data_upload.py b/pages/data_upload.py index 96b4591..8de14f4 100644 --- a/pages/data_upload.py +++ b/pages/data_upload.py @@ -20,9 +20,9 @@ def app(): except Exception as e: print(e) data = pd.read_excel(uploaded_file) - - - + + + # uploaded_files = st.file_uploader("Upload your CSV file here.", type='csv', accept_multiple_files=False) # # Check if file exists # if uploaded_files: @@ -30,7 +30,7 @@ def app(): # file.seek(0) # uploaded_data_read = [pd.read_csv(file) for file in uploaded_files] # raw_data = pd.concat(uploaded_data_read) - + # uploaded_files = st.file_uploader("Upload CSV", type="csv", accept_multiple_files=False) # print(uploaded_files, type(uploaded_files)) # if uploaded_files: @@ -38,7 +38,7 @@ def app(): # file.seek(0) # uploaded_data_read = [pd.read_csv(file) for file in uploaded_files] # raw_data = pd.concat(uploaded_data_read) - + # read temp data # data = pd.read_csv('data/2015.csv') @@ -46,7 +46,7 @@ def app(): ''' Load the data and save the columns with categories as a dataframe. This section also allows changes in the numerical and categorical columns. ''' if st.button("Load Data"): - + # Raw data st.dataframe(data) #utils.getProfile(data) @@ -55,7 +55,7 @@ def app(): #source_code = HtmlFile.read() #components.iframe("data/output.html")# Save the data to a new file data.to_csv('data/main_data.csv', index=False) - + #Generate a pandas profiling report #if st.button("Generate an analysis report"): # utils.getProfile(data) @@ -64,16 +64,16 @@ def app(): # pass # Collect the categorical and numerical columns - + numeric_cols = data.select_dtypes(include=np.number).columns.tolist() categorical_cols = list(set(list(data.columns)) - set(numeric_cols)) - + # Save the columns as a dataframe or dictionary columns = [] # Iterate through the numerical and categorical columns and save in columns columns = utils.genMetaData(data) - + # Save the columns as a dataframe with categories # Here column_name is the name of the field and the type is whether it's numerical or categorical columns_df = pd.DataFrame(columns, columns = ['column_name', 'type']) @@ -83,6 +83,6 @@ def app(): st.markdown("**Column Name**-**Type**") for i in range(columns_df.shape[0]): st.write(f"{i+1}. **{columns_df.iloc[i]['column_name']}** - {columns_df.iloc[i]['type']}") - + st.markdown("""The above are the automated column types detected by the application in the data. In case you wish to change the column types, head over to the **Column Change** section. """) \ No newline at end of file diff --git a/pages/data_visualize.py b/pages/data_visualize.py index f418cdf..47ec334 100644 --- a/pages/data_visualize.py +++ b/pages/data_visualize.py @@ -22,8 +22,8 @@ def app(): for i in range(len(Categorical)): unique_Category_val = {Categorical[i]: utils.mapunique(df_analysis, Categorical[i])} cat_groups = {Categorical[i]: df_visual.groupby(Categorical[i])} - - + + category = st.selectbox("Select Category ", Categorical + Object) sizes = (df_visual[category].value_counts()/df_visual[category].count()) @@ -34,15 +34,15 @@ def app(): explode = [0]*len(labels) explode[int(maxIndex)] = 0.1 explode = tuple(explode) - + fig1, ax1 = plt.subplots() ax1.pie(sizes,explode = explode, labels=labels, autopct='%1.1f%%',shadow=False, startangle=0) ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. - ax1.set_title('Distribution for Categorical Column - ' + (str)(category)) + ax1.set_title(f'Distribution for Categorical Column - {str(category)}') st.pyplot(fig1) - + corr = df_analysis.corr(method='pearson') - + fig2, ax2 = plt.subplots() mask = np.zeros_like(corr, dtype=np.bool) mask[np.triu_indices_from(mask)] = True @@ -51,9 +51,12 @@ def app(): sns.heatmap(corr, mask=mask, linewidths=.5, cmap=cmap, center=0,ax=ax2) ax2.set_title("Correlation Matrix") st.pyplot(fig2) - - - categoryObject=st.selectbox("Select " + (str)(category),unique_Category_val[category]) + + + categoryObject = st.selectbox( + f"Select {str(category)}", unique_Category_val[category] + ) + st.write(cat_groups[category].get_group(categoryObject).describe()) colName = st.selectbox("Select Column ",Numerical) diff --git a/pages/machine_learning.py b/pages/machine_learning.py index a5d2973..dd5ccb0 100644 --- a/pages/machine_learning.py +++ b/pages/machine_learning.py @@ -20,7 +20,7 @@ def app(): by the user. It runs some basic models and let's the user select the X and y variables. """ - # Load the data + # Load the data if 'main_data.csv' not in os.listdir('data'): st.markdown("Please upload data through `Upload Data` page!") else: @@ -62,14 +62,14 @@ def app(): st.write(f"**Variable to be predicted:** {y_var}") st.write(f"**Variable to be used for prediction:** {X_var}") - + # Divide the data into test and train set X = data[X_var] y = data[y_var] # Perform data imputation # st.write("THIS IS WHERE DATA IMPUTATION WILL HAPPEN") - + # Perform encoding X = pd.get_dummies(X) @@ -77,13 +77,13 @@ def app(): if not isNumerical(y): le = LabelEncoder() y = le.fit_transform(y) - + # Print all the classes st.write("The classes and the class allotted to them is the following:-") classes = list(le.classes_) for i in range(len(classes)): st.write(f"{classes[i]} --> {i}") - + # Perform train test splits st.markdown("#### Train Test Splitting") @@ -106,15 +106,11 @@ def app(): if pred_type == "Regression": st.write("Running Regression Models on Sample") - # Table to store model and accurcy - model_r2 = [] - # Linear regression model lr_model = LinearRegression() lr_model.fit(X_train, y_train) lr_r2 = lr_model.score(X_test, y_test) - model_r2.append(['Linear Regression', lr_r2]) - + model_r2 = [['Linear Regression', lr_r2]] # Decision Tree model dt_model = DecisionTreeRegressor() dt_model.fit(X_train, y_train) @@ -131,19 +127,15 @@ def app(): # Make a dataframe of results results = pd.DataFrame(model_r2, columns=['Models', 'R2 Score']).sort_values(by='R2 Score', ascending=False) st.dataframe(results) - + if pred_type == "Classification": st.write("Running Classfication Models on Sample") - # Table to store model and accurcy - model_acc = [] - # Linear regression model lc_model = LogisticRegression() lc_model.fit(X_train, y_train) lc_acc = lc_model.score(X_test, y_test) - model_acc.append(['Linear Regression', lc_acc]) - + model_acc = [['Linear Regression', lc_acc]] # Decision Tree model dtc_model = DecisionTreeClassifier() dtc_model.fit(X_train, y_train) diff --git a/pages/metadata.py b/pages/metadata.py index ba2479c..c935439 100644 --- a/pages/metadata.py +++ b/pages/metadata.py @@ -23,32 +23,32 @@ def app(): The page is divided into two columns using beta columns ''' st.markdown("#### Change the information about column types") - + # Use two column technique col1, col2 = st.beta_columns(2) global name, type # Design column 1 name = col1.selectbox("Select Column", data.columns) - + # Design column two current_type = col_metadata[col_metadata['column_name'] == name]['type'].values[0] print(current_type) column_options = ['numeric', 'categorical','object'] current_index = column_options.index(current_type) - + type = col2.selectbox("Select Column Type", options=column_options, index = current_index) - + st.write("""Select your column name and the new type from the data. To submit all the changes, click on *Submit changes* """) - + if st.button("Change Column Type"): # Set the value in the metadata and resave the file # col_metadata = pd.read_csv('data/metadata/column_type_desc.csv') st.dataframe(col_metadata[col_metadata['column_name'] == name]) - + col_metadata.loc[col_metadata['column_name'] == name, 'type'] = type col_metadata.to_csv('data/metadata/column_type_desc.csv', index = False) diff --git a/pages/modelUtils.py b/pages/modelUtils.py index 5c261d0..5328ab0 100644 --- a/pages/modelUtils.py +++ b/pages/modelUtils.py @@ -4,29 +4,24 @@ from sklearn import linear_model def checkDir(): - if 'models' in os.listdir('../'): - return True - return False + return 'models' in os.listdir('../') def makeDir(): - if checkDir(): - pass - else: + if not checkDir(): os.mkdir('../models') # will save a model at ../models and will return the location+name of saved model def saveModel(modelClass, name = None): fileName = name - if name is None: + if fileName is None: fileName = 'model'+str(len(os.listdir('../models'))) fileName+='.sav' - pickle.dump(modelClass, open('../models/'+fileName, 'wb')) - return '../models/'+fileName + pickle.dump(modelClass, open(f'../models/{fileName}', 'wb')) + return f'../models/{fileName}' # model will be loaded through the location of model that is returned from the def loadModel(fileName): - model = pickle.load(open(fileName, 'rb')) - return model + return pickle.load(open(fileName, 'rb')) ### All the below tests passed if __name__ == '__main__': @@ -36,6 +31,6 @@ def loadModel(fileName): reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) print("og Coeff: ",reg.coef_) path = saveModel(reg) - print("Model Name: "+path) + print(f"Model Name: {path}") model = loadModel(path) print("Loaded Model:", model.coef_) diff --git a/pages/redundant.py b/pages/redundant.py index 5716ece..ab18915 100644 --- a/pages/redundant.py +++ b/pages/redundant.py @@ -5,7 +5,7 @@ import os def app(): - + if 'main_data.csv' not in os.listdir('data'): st.markdown("Please upload data through `Upload Data` page!") else: diff --git a/pages/utils.py b/pages/utils.py index e0d7421..ea2d5dd 100644 --- a/pages/utils.py +++ b/pages/utils.py @@ -39,11 +39,11 @@ def genMetaData(df): if isCategorical(df[col[i]]): ColumnType.append((col[i],"categorical")) Categorical.append(col[i]) - + elif is_numeric_dtype(df[col[i]]): ColumnType.append((col[i],"numerical")) Numerical.append(col[i]) - + else: ColumnType.append((col[i],"object")) Object.append(col[i]) @@ -51,10 +51,8 @@ def genMetaData(df): return ColumnType def makeMapDict(col): - uniqueVals = list(np.unique(col)) - uniqueVals.sort() - dict_ = {uniqueVals[i]: i for i in range(len(uniqueVals))} - return dict_ + uniqueVals = sorted(np.unique(col)) + return {uniqueVals[i]: i for i in range(len(uniqueVals))} def mapunique(df, colName): dict_ = makeMapDict(df[colName]) @@ -66,16 +64,11 @@ def mapunique(df, colName): ## For redundant columns def getRedundentColumns(corr, y: str, threshold =0.1): cols = corr.columns - redunt = [] k = 0 - for ind, c in enumerate(corr[y]): - if c<1-threshold: - redunt.append(cols[ind]) - return redunt + return [cols[ind] for ind, c in enumerate(corr[y]) if c<1-threshold] def newDF(df, columns2Drop): - newDF = df.drop(columns2Drop, axis = 'columns') - return newDF + return df.drop(columns2Drop, axis = 'columns') if __name__ == '__main__': df = {"Name": ["salil", "saxena", "for", "int"]}