-
Notifications
You must be signed in to change notification settings - Fork 62
Sourcery Starbot ⭐ refactored prakharrathi25/data-storyteller #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,8 +22,8 @@ def app(): | |
| for i in range(len(Categorical)): | ||
| unique_Category_val = {Categorical[i]: utils.mapunique(df_analysis, Categorical[i])} | ||
| cat_groups = {Categorical[i]: df_visual.groupby(Categorical[i])} | ||
|
|
||
|
|
||
|
Comment on lines
-25
to
+26
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
| category = st.selectbox("Select Category ", Categorical + Object) | ||
|
|
||
| sizes = (df_visual[category].value_counts()/df_visual[category].count()) | ||
|
|
@@ -34,15 +34,15 @@ def app(): | |
| explode = [0]*len(labels) | ||
| explode[int(maxIndex)] = 0.1 | ||
| explode = tuple(explode) | ||
|
|
||
| fig1, ax1 = plt.subplots() | ||
| ax1.pie(sizes,explode = explode, labels=labels, autopct='%1.1f%%',shadow=False, startangle=0) | ||
| ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. | ||
| ax1.set_title('Distribution for Categorical Column - ' + (str)(category)) | ||
| ax1.set_title(f'Distribution for Categorical Column - {str(category)}') | ||
| st.pyplot(fig1) | ||
|
|
||
| corr = df_analysis.corr(method='pearson') | ||
|
|
||
| fig2, ax2 = plt.subplots() | ||
| mask = np.zeros_like(corr, dtype=np.bool) | ||
| mask[np.triu_indices_from(mask)] = True | ||
|
|
@@ -51,9 +51,12 @@ def app(): | |
| sns.heatmap(corr, mask=mask, linewidths=.5, cmap=cmap, center=0,ax=ax2) | ||
| ax2.set_title("Correlation Matrix") | ||
| st.pyplot(fig2) | ||
|
|
||
|
|
||
| categoryObject=st.selectbox("Select " + (str)(category),unique_Category_val[category]) | ||
|
|
||
|
|
||
| categoryObject = st.selectbox( | ||
| f"Select {str(category)}", unique_Category_val[category] | ||
| ) | ||
|
|
||
| st.write(cat_groups[category].get_group(categoryObject).describe()) | ||
| colName = st.selectbox("Select Column ",Numerical) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,7 @@ def app(): | |
| by the user. It runs some basic models and lets the user select the X and y variables. | ||
| """ | ||
|
|
||
| # Load the data | ||
| # Load the data | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ): |
||
| if 'main_data.csv' not in os.listdir('data'): | ||
| st.markdown("Please upload data through `Upload Data` page!") | ||
| else: | ||
|
|
@@ -62,28 +62,28 @@ def app(): | |
|
|
||
| st.write(f"**Variable to be predicted:** {y_var}") | ||
| st.write(f"**Variable to be used for prediction:** {X_var}") | ||
|
|
||
| # Divide the data into test and train set | ||
| X = data[X_var] | ||
| y = data[y_var] | ||
|
|
||
| # Perform data imputation | ||
| # st.write("THIS IS WHERE DATA IMPUTATION WILL HAPPEN") | ||
|
|
||
| # Perform encoding | ||
| X = pd.get_dummies(X) | ||
|
|
||
| # Check if y needs to be encoded | ||
| if not isNumerical(y): | ||
| le = LabelEncoder() | ||
| y = le.fit_transform(y) | ||
|
|
||
| # Print all the classes | ||
| st.write("The classes and the class allotted to them is the following:-") | ||
| classes = list(le.classes_) | ||
| for i in range(len(classes)): | ||
| st.write(f"{classes[i]} --> {i}") | ||
|
|
||
|
|
||
| # Perform train test splits | ||
| st.markdown("#### Train Test Splitting") | ||
|
|
@@ -106,15 +106,11 @@ def app(): | |
| if pred_type == "Regression": | ||
| st.write("Running Regression Models on Sample") | ||
|
|
||
| # Table to store model and accuracy | ||
| model_r2 = [] | ||
|
|
||
| # Linear regression model | ||
| lr_model = LinearRegression() | ||
| lr_model.fit(X_train, y_train) | ||
| lr_r2 = lr_model.score(X_test, y_test) | ||
| model_r2.append(['Linear Regression', lr_r2]) | ||
|
|
||
| model_r2 = [['Linear Regression', lr_r2]] | ||
| # Decision Tree model | ||
| dt_model = DecisionTreeRegressor() | ||
| dt_model.fit(X_train, y_train) | ||
|
|
@@ -131,19 +127,15 @@ def app(): | |
| # Make a dataframe of results | ||
| results = pd.DataFrame(model_r2, columns=['Models', 'R2 Score']).sort_values(by='R2 Score', ascending=False) | ||
| st.dataframe(results) | ||
|
|
||
| if pred_type == "Classification": | ||
| st.write("Running Classfication Models on Sample") | ||
|
|
||
| # Table to store model and accuracy | ||
| model_acc = [] | ||
|
|
||
| # Logistic regression model | ||
| lc_model = LogisticRegression() | ||
| lc_model.fit(X_train, y_train) | ||
| lc_acc = lc_model.score(X_test, y_test) | ||
| model_acc.append(['Linear Regression', lc_acc]) | ||
|
|
||
| model_acc = [['Linear Regression', lc_acc]] | ||
| # Decision Tree model | ||
| dtc_model = DecisionTreeClassifier() | ||
| dtc_model.fit(X_train, y_train) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,32 +23,32 @@ def app(): | |
| The page is divided into two columns using beta columns | ||
| ''' | ||
| st.markdown("#### Change the information about column types") | ||
|
|
||
| # Use two column technique | ||
| col1, col2 = st.beta_columns(2) | ||
|
|
||
| global name, type | ||
| # Design column 1 | ||
| name = col1.selectbox("Select Column", data.columns) | ||
|
|
||
| # Design column two | ||
| current_type = col_metadata[col_metadata['column_name'] == name]['type'].values[0] | ||
| print(current_type) | ||
| column_options = ['numeric', 'categorical','object'] | ||
| current_index = column_options.index(current_type) | ||
|
|
||
| type = col2.selectbox("Select Column Type", options=column_options, index = current_index) | ||
|
|
||
| st.write("""Select your column name and the new type from the data. | ||
| To submit all the changes, click on *Submit changes* """) | ||
|
|
||
|
|
||
| if st.button("Change Column Type"): | ||
|
|
||
| # Set the value in the metadata and resave the file | ||
| # col_metadata = pd.read_csv('data/metadata/column_type_desc.csv') | ||
| st.dataframe(col_metadata[col_metadata['column_name'] == name]) | ||
|
|
||
|
Comment on lines
-26
to
+51
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Found the following improvement in Function |
||
| col_metadata.loc[col_metadata['column_name'] == name, 'type'] = type | ||
| col_metadata.to_csv('data/metadata/column_type_desc.csv', index = False) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,29 +4,24 @@ | |
| from sklearn import linear_model | ||
|
|
||
| def checkDir(): | ||
| if 'models' in os.listdir('../'): | ||
| return True | ||
| return False | ||
| return 'models' in os.listdir('../') | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| def makeDir(): | ||
| if checkDir(): | ||
| pass | ||
| else: | ||
| if not checkDir(): | ||
|
Comment on lines
-12
to
+10
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
| os.mkdir('../models') | ||
|
|
||
| # will save a model at ../models and will return the location+name of saved model | ||
| def saveModel(modelClass, name = None): | ||
| fileName = name | ||
| if name is None: | ||
| if fileName is None: | ||
| fileName = 'model'+str(len(os.listdir('../models'))) | ||
| fileName+='.sav' | ||
| pickle.dump(modelClass, open('../models/'+fileName, 'wb')) | ||
| return '../models/'+fileName | ||
| pickle.dump(modelClass, open(f'../models/{fileName}', 'wb')) | ||
| return f'../models/{fileName}' | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| # model will be loaded through the location of model that is returned from the saveModel function | ||
| def loadModel(fileName): | ||
| model = pickle.load(open(fileName, 'rb')) | ||
| return model | ||
| return pickle.load(open(fileName, 'rb')) | ||
|
Comment on lines
-28
to
+24
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| ### All the below tests passed | ||
| if __name__ == '__main__': | ||
|
|
@@ -36,6 +31,6 @@ def loadModel(fileName): | |
| reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) | ||
| print("og Coeff: ",reg.coef_) | ||
| path = saveModel(reg) | ||
| print("Model Name: "+path) | ||
| print(f"Model Name: {path}") | ||
|
Comment on lines
-39
to
+34
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Lines
|
||
| model = loadModel(path) | ||
| print("Loaded Model:", model.coef_) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,7 @@ | |
| import os | ||
|
|
||
| def app(): | ||
|
|
||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Found the following improvement in Function |
||
| if 'main_data.csv' not in os.listdir('data'): | ||
| st.markdown("Please upload data through `Upload Data` page!") | ||
| else: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,22 +39,20 @@ def genMetaData(df): | |
| if isCategorical(df[col[i]]): | ||
| ColumnType.append((col[i],"categorical")) | ||
| Categorical.append(col[i]) | ||
|
|
||
| elif is_numeric_dtype(df[col[i]]): | ||
| ColumnType.append((col[i],"numerical")) | ||
| Numerical.append(col[i]) | ||
|
|
||
|
Comment on lines
-42
to
+46
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Found the following improvement in Function |
||
| else: | ||
| ColumnType.append((col[i],"object")) | ||
| Object.append(col[i]) | ||
|
|
||
| return ColumnType | ||
|
|
||
| def makeMapDict(col): | ||
| uniqueVals = list(np.unique(col)) | ||
| uniqueVals.sort() | ||
| dict_ = {uniqueVals[i]: i for i in range(len(uniqueVals))} | ||
| return dict_ | ||
| uniqueVals = sorted(np.unique(col)) | ||
| return {uniqueVals[i]: i for i in range(len(uniqueVals))} | ||
|
Comment on lines
-54
to
+55
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| def mapunique(df, colName): | ||
| dict_ = makeMapDict(df[colName]) | ||
|
|
@@ -66,16 +64,11 @@ def mapunique(df, colName): | |
| ## For redundant columns | ||
| def getRedundentColumns(corr, y: str, threshold =0.1): | ||
| cols = corr.columns | ||
| redunt = [] | ||
| k = 0 | ||
| for ind, c in enumerate(corr[y]): | ||
| if c<1-threshold: | ||
| redunt.append(cols[ind]) | ||
| return redunt | ||
| return [cols[ind] for ind, c in enumerate(corr[y]) if c<1-threshold] | ||
|
Comment on lines
-69
to
+68
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| def newDF(df, columns2Drop): | ||
| newDF = df.drop(columns2Drop, axis = 'columns') | ||
| return newDF | ||
| return df.drop(columns2Drop, axis = 'columns') | ||
|
Comment on lines
-77
to
+71
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
|
||
| if __name__ == '__main__': | ||
| df = {"Name": ["salil", "saxena", "for", "int"]} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Found the following improvement in Function
app: