Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions pages/data_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,33 +20,33 @@ def app():
except Exception as e:
print(e)
data = pd.read_excel(uploaded_file)



Comment on lines -23 to +25
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found the following improvement in Function app:

# uploaded_files = st.file_uploader("Upload your CSV file here.", type='csv', accept_multiple_files=False)
# # Check if file exists
# if uploaded_files:
# for file in uploaded_files:
# file.seek(0)
# uploaded_data_read = [pd.read_csv(file) for file in uploaded_files]
# raw_data = pd.concat(uploaded_data_read)

# uploaded_files = st.file_uploader("Upload CSV", type="csv", accept_multiple_files=False)
# print(uploaded_files, type(uploaded_files))
# if uploaded_files:
# for file in uploaded_files:
# file.seek(0)
# uploaded_data_read = [pd.read_csv(file) for file in uploaded_files]
# raw_data = pd.concat(uploaded_data_read)

# read temp data
# data = pd.read_csv('data/2015.csv')


''' Load the data and save the columns with categories as a dataframe.
This section also allows changes in the numerical and categorical columns. '''
if st.button("Load Data"):

# Raw data
st.dataframe(data)
#utils.getProfile(data)
Expand All @@ -55,7 +55,7 @@ def app():
#source_code = HtmlFile.read()
#components.iframe("data/output.html")# Save the data to a new file
data.to_csv('data/main_data.csv', index=False)

#Generate a pandas profiling report
#if st.button("Generate an analysis report"):
# utils.getProfile(data)
Expand All @@ -64,16 +64,16 @@ def app():
# pass

# Collect the categorical and numerical columns

numeric_cols = data.select_dtypes(include=np.number).columns.tolist()
categorical_cols = list(set(list(data.columns)) - set(numeric_cols))

# Save the columns as a dataframe or dictionary
columns = []

# Iterate through the numerical and categorical columns and save in columns
columns = utils.genMetaData(data)

# Save the columns as a dataframe with categories
# Here column_name is the name of the field and the type is whether it's numerical or categorical
columns_df = pd.DataFrame(columns, columns = ['column_name', 'type'])
Expand All @@ -83,6 +83,6 @@ def app():
st.markdown("**Column Name**-**Type**")
for i in range(columns_df.shape[0]):
st.write(f"{i+1}. **{columns_df.iloc[i]['column_name']}** - {columns_df.iloc[i]['type']}")

st.markdown("""The above are the automated column types detected by the application in the data.
In case you wish to change the column types, head over to the **Column Change** section. """)
21 changes: 12 additions & 9 deletions pages/data_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def app():
for i in range(len(Categorical)):
unique_Category_val = {Categorical[i]: utils.mapunique(df_analysis, Categorical[i])}
cat_groups = {Categorical[i]: df_visual.groupby(Categorical[i])}


Comment on lines -25 to +26
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function app refactored with the following changes:

category = st.selectbox("Select Category ", Categorical + Object)

sizes = (df_visual[category].value_counts()/df_visual[category].count())
Expand All @@ -34,15 +34,15 @@ def app():
explode = [0]*len(labels)
explode[int(maxIndex)] = 0.1
explode = tuple(explode)

fig1, ax1 = plt.subplots()
ax1.pie(sizes,explode = explode, labels=labels, autopct='%1.1f%%',shadow=False, startangle=0)
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title('Distribution for Categorical Column - ' + (str)(category))
ax1.set_title(f'Distribution for Categorical Column - {str(category)}')
st.pyplot(fig1)

corr = df_analysis.corr(method='pearson')

fig2, ax2 = plt.subplots()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
Expand All @@ -51,9 +51,12 @@ def app():
sns.heatmap(corr, mask=mask, linewidths=.5, cmap=cmap, center=0,ax=ax2)
ax2.set_title("Correlation Matrix")
st.pyplot(fig2)


categoryObject=st.selectbox("Select " + (str)(category),unique_Category_val[category])


categoryObject = st.selectbox(
f"Select {str(category)}", unique_Category_val[category]
)

st.write(cat_groups[category].get_group(categoryObject).describe())
colName = st.selectbox("Select Column ",Numerical)

Expand Down
24 changes: 8 additions & 16 deletions pages/machine_learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def app():
by the user. It runs some basic models and lets the user select the X and y variables.
"""

# Load the data
# Load the data
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function app refactored with the following changes:

This removes the following comments ( why? ):

# Table to store model and accurcy

if 'main_data.csv' not in os.listdir('data'):
st.markdown("Please upload data through `Upload Data` page!")
else:
Expand Down Expand Up @@ -62,28 +62,28 @@ def app():

st.write(f"**Variable to be predicted:** {y_var}")
st.write(f"**Variable to be used for prediction:** {X_var}")

# Divide the data into test and train set
X = data[X_var]
y = data[y_var]

# Perform data imputation
# st.write("THIS IS WHERE DATA IMPUTATION WILL HAPPEN")

# Perform encoding
X = pd.get_dummies(X)

# Check if y needs to be encoded
if not isNumerical(y):
le = LabelEncoder()
y = le.fit_transform(y)

# Print all the classes
st.write("The classes and the class allotted to them is the following:-")
classes = list(le.classes_)
for i in range(len(classes)):
st.write(f"{classes[i]} --> {i}")


# Perform train test splits
st.markdown("#### Train Test Splitting")
Expand All @@ -106,15 +106,11 @@ def app():
if pred_type == "Regression":
st.write("Running Regression Models on Sample")

# Table to store model and accurcy
model_r2 = []

# Linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_r2 = lr_model.score(X_test, y_test)
model_r2.append(['Linear Regression', lr_r2])

model_r2 = [['Linear Regression', lr_r2]]
# Decision Tree model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
Expand All @@ -131,19 +127,15 @@ def app():
# Make a dataframe of results
results = pd.DataFrame(model_r2, columns=['Models', 'R2 Score']).sort_values(by='R2 Score', ascending=False)
st.dataframe(results)

if pred_type == "Classification":
st.write("Running Classfication Models on Sample")

# Table to store model and accurcy
model_acc = []

# Logistic regression model
lc_model = LogisticRegression()
lc_model.fit(X_train, y_train)
lc_acc = lc_model.score(X_test, y_test)
model_acc.append(['Linear Regression', lc_acc])

model_acc = [['Linear Regression', lc_acc]]
# Decision Tree model
dtc_model = DecisionTreeClassifier()
dtc_model.fit(X_train, y_train)
Expand Down
12 changes: 6 additions & 6 deletions pages/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,32 @@ def app():
The page is divided into two columns using beta columns
'''
st.markdown("#### Change the information about column types")

# Use two column technique
col1, col2 = st.beta_columns(2)

global name, type
# Design column 1
name = col1.selectbox("Select Column", data.columns)

# Design column two
current_type = col_metadata[col_metadata['column_name'] == name]['type'].values[0]
print(current_type)
column_options = ['numeric', 'categorical','object']
current_index = column_options.index(current_type)

type = col2.selectbox("Select Column Type", options=column_options, index = current_index)

st.write("""Select your column name and the new type from the data.
To submit all the changes, click on *Submit changes* """)


if st.button("Change Column Type"):

# Set the value in the metadata and resave the file
# col_metadata = pd.read_csv('data/metadata/column_type_desc.csv')
st.dataframe(col_metadata[col_metadata['column_name'] == name])

Comment on lines -26 to +51
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found the following improvement in Function app:

col_metadata.loc[col_metadata['column_name'] == name, 'type'] = type
col_metadata.to_csv('data/metadata/column_type_desc.csv', index = False)

Expand Down
19 changes: 7 additions & 12 deletions pages/modelUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,24 @@
from sklearn import linear_model

def checkDir():
if 'models' in os.listdir('../'):
return True
return False
return 'models' in os.listdir('../')
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function checkDir refactored with the following changes:


def makeDir():
if checkDir():
pass
else:
if not checkDir():
Comment on lines -12 to +10
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function makeDir refactored with the following changes:

os.mkdir('../models')

# will save a model at ../models and will return the location+name of saved model
def saveModel(modelClass, name = None):
fileName = name
if name is None:
if fileName is None:
fileName = 'model'+str(len(os.listdir('../models')))
fileName+='.sav'
pickle.dump(modelClass, open('../models/'+fileName, 'wb'))
return '../models/'+fileName
pickle.dump(modelClass, open(f'../models/{fileName}', 'wb'))
return f'../models/{fileName}'
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function saveModel refactored with the following changes:


# model will be loaded from the location (path + file name) that is returned by saveModel
def loadModel(fileName):
model = pickle.load(open(fileName, 'rb'))
return model
return pickle.load(open(fileName, 'rb'))
Comment on lines -28 to +24
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function loadModel refactored with the following changes:


### All the below tests passed
if __name__ == '__main__':
Expand All @@ -36,6 +31,6 @@ def loadModel(fileName):
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
print("og Coeff: ",reg.coef_)
path = saveModel(reg)
print("Model Name: "+path)
print(f"Model Name: {path}")
Comment on lines -39 to +34
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lines 39-39 refactored with the following changes:

model = loadModel(path)
print("Loaded Model:", model.coef_)
2 changes: 1 addition & 1 deletion pages/redundant.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os

def app():

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found the following improvement in Function app:

if 'main_data.csv' not in os.listdir('data'):
st.markdown("Please upload data through `Upload Data` page!")
else:
Expand Down
19 changes: 6 additions & 13 deletions pages/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,20 @@ def genMetaData(df):
if isCategorical(df[col[i]]):
ColumnType.append((col[i],"categorical"))
Categorical.append(col[i])

elif is_numeric_dtype(df[col[i]]):
ColumnType.append((col[i],"numerical"))
Numerical.append(col[i])

Comment on lines -42 to +46
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found the following improvement in Function genMetaData:

else:
ColumnType.append((col[i],"object"))
Object.append(col[i])

return ColumnType

def makeMapDict(col):
uniqueVals = list(np.unique(col))
uniqueVals.sort()
dict_ = {uniqueVals[i]: i for i in range(len(uniqueVals))}
return dict_
uniqueVals = sorted(np.unique(col))
return {uniqueVals[i]: i for i in range(len(uniqueVals))}
Comment on lines -54 to +55
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function makeMapDict refactored with the following changes:


def mapunique(df, colName):
dict_ = makeMapDict(df[colName])
Expand All @@ -66,16 +64,11 @@ def mapunique(df, colName):
## For redundant columns
def getRedundentColumns(corr, y: str, threshold =0.1):
cols = corr.columns
redunt = []
k = 0
for ind, c in enumerate(corr[y]):
if c<1-threshold:
redunt.append(cols[ind])
return redunt
return [cols[ind] for ind, c in enumerate(corr[y]) if c<1-threshold]
Comment on lines -69 to +68
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function getRedundentColumns refactored with the following changes:


def newDF(df, columns2Drop):
newDF = df.drop(columns2Drop, axis = 'columns')
return newDF
return df.drop(columns2Drop, axis = 'columns')
Comment on lines -77 to +71
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function newDF refactored with the following changes:


if __name__ == '__main__':
df = {"Name": ["salil", "saxena", "for", "int"]}
Expand Down