diff --git a/code/matplotlib_lessons.py b/code/matplotlib_lessons.py
index c3271a8..73ac41b 100644
--- a/code/matplotlib_lessons.py
+++ b/code/matplotlib_lessons.py
@@ -6,7 +6,7 @@
 #
 # ## The importance of communicating your results
 #
-# So, you've done your preprocessing, and you've doen your analysis. Now, you have to communicate your results. Often, you're going to be communicating with people who aren't programmers. Data visualization is a great way to communicate your analyses in clear, meaningful ways. It also allows you to literally see what's going on in your data. (I should note that data visualization is often a great exploratory step, because again, you get to see what's going on in your data.)
+# So, you've done your preprocessing, and you've done your analysis. Now, you have to communicate your results. Often, you're going to be communicating with people who aren't programmers. Data visualization is a great way to communicate your analyses in clear, meaningful ways. It also allows you to literally see what's going on in your data. (I should note that data visualization is often a great exploratory step, because again, you get to see what's going on in your data.)
 #
 # One of the most important parts of visualization is choosing the best plot for your data. A great visualization shouldn't hide the analysis you've made but enhance it. It doesn't have to fancy or complicated, just clear.
 #
diff --git a/code/pandas_lessons.py b/code/pandas_lessons.py
index f60e671..8eaac7d 100644
--- a/code/pandas_lessons.py
+++ b/code/pandas_lessons.py
@@ -88,7 +88,7 @@
 #print combine_series
-# Notice how in column 'b', we have two kinds of data. If a column in a DataFrame contains multiple types of data, the data type (or dtype) of the column will be chosen to accomodate all of the data. We can look at the data types of different columns with the dtypes attribute. Object is the most general, which is what has been chosen for column 'b'.
+# Notice how in column 'b', we have two kinds of data. If a column in a DataFrame contains multiple types of data, the data type (or dtype) of the column will be chosen to accommodate all of the data. We can look at the data types of different columns with the dtypes attribute. Object is the most general, which is what has been chosen for column 'b'.
@@ -116,7 +116,7 @@
 #print wine.head()
-# Reading in a text file is just as easy. Make sure to pass in '\t' to the delimter parameter.
+# Reading in a text file is just as easy. Make sure to pass in '\t' to the delimiter parameter.
diff --git a/code/scikit_learn_lessons.py b/code/scikit_learn_lessons.py
index bc2a428..25d1ff8 100644
--- a/code/scikit_learn_lessons.py
+++ b/code/scikit_learn_lessons.py
@@ -5,7 +5,7 @@
 #
 # ## A brief intro to machine learning
 #
-# There's a fair bit of backround knowledge that's important to know before we dive into the code. The actual code is rather simple, but I want you to understand exactly what's going on.
+# There's a fair bit of background knowledge that's important to know before we dive into the code. The actual code is rather simple, but I want you to understand exactly what's going on.
 #
 # ### What is machine learning?
 #
@@ -49,11 +49,11 @@
 #
 # ## k-Nearest Neighbors
 #
-# The k-Nearest Neighbors (kNN) algorithm finds a predetermined number of "neighbor" samples that are closest in distance to a starting data point and makes predictions based on the distances. kNN predicts labels by looking at the labels of its nearest neighbors. The metric used to calcuate the distances between points can be any distance metric measure, such as the Euclidean metric or the Manhattan distance.
+# The k-Nearest Neighbors (kNN) algorithm finds a predetermined number of "neighbor" samples that are closest in distance to a starting data point and makes predictions based on the distances. kNN predicts labels by looking at the labels of its nearest neighbors. The metric used to calculate the distances between points can be any distance metric measure, such as the Euclidean metric or the Manhattan distance.
 #
 # kNN is useful when your data is linear in nature and can therefore be measured with a distance metric. Also, kNN does well when the decision boundary (or the delineation between classes) is hard to identify.
 #
-# kNN comes with a couple of caveats. If the classes in your dataset are unevenly distributed, the highest-occuring label will tend to dominate predictions. Also, choosing the *k* of kNN can be tricky. Choosing *k* deserves its own three hour tutorial, so we'll just go with the defaults for today.
+# kNN comes with a couple of caveats. If the classes in your dataset are unevenly distributed, the highest-occurring label will tend to dominate predictions. Also, choosing the *k* of kNN can be tricky. Choosing *k* deserves its own three hour tutorial, so we'll just go with the defaults for today.
 #
 # ### Classifying in scikit-learn: kNN
 #
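For reference, a few self-contained sketches of the patterns these lessons describe follow; none of this code comes from the patched files. First, the matplotlib hunk's advice about clear, simple plots, shown with invented data and placeholder labels:

import matplotlib.pyplot as plt

# Hypothetical example data; not taken from the tutorial.
years = [2010, 2011, 2012, 2013, 2014]
sales = [12, 15, 14, 19, 23]

fig, ax = plt.subplots()
ax.plot(years, sales, marker="o")

# A clear plot doesn't need to be fancy, but it does label its axes
# and say what it shows.
ax.set_xlabel("Year")
ax.set_ylabel("Sales (thousands of units)")
ax.set_title("Annual sales")

plt.show()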
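Next, the dtype behaviour described in the first pandas hunk. The DataFrame below is a made-up example, not the lesson's combine_series object: a column holding mixed types falls back to the general object dtype, which the dtypes attribute reports.

import pandas as pd

# Hypothetical DataFrame: column 'a' holds only integers, while column 'b'
# mixes an integer, a string, and a float, so pandas stores it as object.
df = pd.DataFrame({"a": [1, 2, 3], "b": [1, "two", 3.0]})

print(df.dtypes)
# a     int64
# b    object
# dtype: object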
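Likewise, the tab-delimited text file mentioned in the second pandas hunk comes down to a single read_csv call; the file name here is a placeholder, not the tutorial's dataset.

import pandas as pd

# 'data.txt' is a placeholder path; pass '\t' as the delimiter for
# tab-separated text files.
df = pd.read_csv("data.txt", delimiter="\t")
print(df.head())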
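Finally, a minimal sketch of the scikit-learn kNN classification workflow the last hunk refers to, using the library's bundled iris data and the default settings the lesson says it will stick with. This is an illustrative example under those assumptions, not code from the tutorial.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load a small labeled dataset and hold out a test split.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Defaults: k = 5 neighbors, Minkowski distance with p = 2 (Euclidean).
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Each test point gets the majority label of its 5 nearest training points.
print(knn.score(X_test, y_test))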