ANY model is understood to be an approximation or emulation of a real-world object, event, or process.
The underlying idea behind virtually all statistical models is to find a relationship between measured features (variables) and the probability of a particular outcome. If a model could predict the outcome with perfect certainty, it would appear to be a successful model of the real-world phenomenon. That is, presumably you would have captured ALL the variables (and their relations) necessary to understand or predict a 'real world' event.
In reality, a model rarely predicts things with perfect accuracy, especially if it involves any phenomenon which is known to contain a great deal of randomness and/or an unknown number of variables (e.g., human behavior).
There are many, many types of statistical models. These depend on the types of variables that you have and the type of outcome that you are trying to predict.
There are two kinds of variables: independent variables are the variables you observe or measure about a particular event or observation, and the outcome you are trying to predict is called the dependent variable.
In machine learning (an advanced kind of statistical modeling) these are often referred to as "input" and "output" variables, respectively.
There are two overarching 'types' of statistical models, depending on whether one is trying to predict or estimate a quantitative value, or to predict a category. Examples of categorical variables: types of fruit; musical genres; gender; etc. -- anything where the various outcomes do not have any kind of quantitative relation to each other. In statistical modeling, when we try to predict a categorical outcome, this is referred to as a classification problem.
On the other hand, if you are trying to estimate a quantitative value, this typically involves a linear model and is often referred to as a regression problem (even though there are many kinds of regression, including regression for categorical variables).
from IPython.display import Image
Image('../images/classification_regression.png', embed=True)
All tasks for the final involve classification problems, except for tempo estimation. However, those of you doing the tempo estimation task will not actually be building a statistical model. Instead, you will take an algorithmic approach to estimating tempo. (You will still have to evaluate your algorithm, however.)
We will walk through a classification problem: in this case a binary one, so only two categories. We will use a kind of model called logistic regression to carry out the task.
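(In a nutshell, logistic regression models the probability of the 'positive' category as p = 1 / (1 + e^-(b0 + b1*x1 + ... + bn*xn)), where the x's are the feature values and the b's are coefficients the model learns from the training data; the predicted label is then whichever category has the higher probability.)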
We'll load the ionosphere dataset. Here's a description:
Data Set Information:
This radar data was collected by a system in Goose Bay, Labrador. This system consists of a phased array of 16 high-frequency antennas with a total transmitted power on the order of 6.4 kilowatts. See the paper for more details. The targets were free electrons in the ionosphere. "Good" radar returns are those showing evidence of some type of structure in the ionosphere. "Bad" returns are those that do not; their signals pass through the ionosphere.
The outcome we would like to predict is the class (good/bad). Theoretically, if we can successfully model the data, then in the future we can predict whether new data are 'good' or 'bad'.
import pandas as pd
original = pd.read_csv('../Datasets/ionosphere.csv')
original.head()
| | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V26 | V27 | V28 | V29 | V30 | V31 | V32 | V33 | V34 | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0.99539 | -0.05889 | 0.85243 | 0.02306 | 0.83398 | -0.37708 | 1.00000 | 0.03760 | ... | -0.51171 | 0.41078 | -0.46168 | 0.21266 | -0.34090 | 0.42267 | -0.54487 | 0.18641 | -0.45300 | good |
| 1 | 1 | 0 | 1.00000 | -0.18829 | 0.93035 | -0.36156 | -0.10868 | -0.93597 | 1.00000 | -0.04549 | ... | -0.26569 | -0.20468 | -0.18401 | -0.19040 | -0.11593 | -0.16626 | -0.06288 | -0.13738 | -0.02447 | bad |
| 2 | 1 | 0 | 1.00000 | -0.03365 | 1.00000 | 0.00485 | 1.00000 | -0.12062 | 0.88965 | 0.01198 | ... | -0.40220 | 0.58984 | -0.22145 | 0.43100 | -0.17365 | 0.60436 | -0.24180 | 0.56045 | -0.38238 | good |
| 3 | 1 | 0 | 1.00000 | -0.45161 | 1.00000 | 1.00000 | 0.71216 | -1.00000 | 0.00000 | 0.00000 | ... | 0.90695 | 0.51613 | 1.00000 | 1.00000 | -0.20099 | 0.25682 | 1.00000 | -0.32382 | 1.00000 | bad |
| 4 | 1 | 0 | 1.00000 | -0.02401 | 0.94140 | 0.06531 | 0.92106 | -0.23255 | 0.77152 | -0.16399 | ... | -0.65158 | 0.13290 | -0.53206 | 0.02431 | -0.62197 | -0.05707 | -0.59573 | -0.04608 | -0.65697 | good |
5 rows × 35 columns
len(original)
351
original['Class'].value_counts()
good    225
bad     126
Name: Class, dtype: int64
Get rid of the two columns 'V1' and 'V2' using pd.DataFrame.drop
original.columns
df = original.drop(['V1','V2'], axis=1)
df.head()
| | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | ... | V26 | V27 | V28 | V29 | V30 | V31 | V32 | V33 | V34 | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.99539 | -0.05889 | 0.85243 | 0.02306 | 0.83398 | -0.37708 | 1.00000 | 0.03760 | 0.85243 | -0.17755 | ... | -0.51171 | 0.41078 | -0.46168 | 0.21266 | -0.34090 | 0.42267 | -0.54487 | 0.18641 | -0.45300 | good |
| 1 | 1.00000 | -0.18829 | 0.93035 | -0.36156 | -0.10868 | -0.93597 | 1.00000 | -0.04549 | 0.50874 | -0.67743 | ... | -0.26569 | -0.20468 | -0.18401 | -0.19040 | -0.11593 | -0.16626 | -0.06288 | -0.13738 | -0.02447 | bad |
| 2 | 1.00000 | -0.03365 | 1.00000 | 0.00485 | 1.00000 | -0.12062 | 0.88965 | 0.01198 | 0.73082 | 0.05346 | ... | -0.40220 | 0.58984 | -0.22145 | 0.43100 | -0.17365 | 0.60436 | -0.24180 | 0.56045 | -0.38238 | good |
| 3 | 1.00000 | -0.45161 | 1.00000 | 1.00000 | 0.71216 | -1.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | ... | 0.90695 | 0.51613 | 1.00000 | 1.00000 | -0.20099 | 0.25682 | 1.00000 | -0.32382 | 1.00000 | bad |
| 4 | 1.00000 | -0.02401 | 0.94140 | 0.06531 | 0.92106 | -0.23255 | 0.77152 | -0.16399 | 0.52798 | -0.20275 | ... | -0.65158 | 0.13290 | -0.53206 | 0.02431 | -0.62197 | -0.05707 | -0.59573 | -0.04608 | -0.65697 | good |
5 rows × 33 columns
Make two subsets (i.e., DataFrames) of your dataset in pandas based on the 'Class' category
good = df[df.Class=='good']
bad = df[df.Class=='bad']
Make a single plot with two subplots stacked vertically and with a very large figure width, one plot for 'good' and one for 'bad' showing all features simultaneously. (You don't need a legend). Do the data appear different in any way?
import matplotlib.pyplot as plt
fig, ax = plt.subplots(2, 1, figsize=(20, 6))  #two stacked subplots with a very large figure width
ax[0].plot(good.iloc[:,:-1]);
ax[1].plot(bad.iloc[:,:-1]);
Make a boxplot of all the features comparing your "good" and "bad" classes. This can allow you to see the relation between a variable and the outcome, (and which variables are likely going to be better predictors).
# simple but not pretty:
# note that a semicolon after the call will prevent printouts
df.boxplot(by="Class");
# a bit better...
df.groupby(['Class']).boxplot(layout=(2,1), rot=90)
plt.tight_layout()
To assess your model’s performance later, you will need to divide the data set into two parts: a training set and a test set. The first is used to find the best combination of parameters for predicting, while the second is used to evaluate (or test the accuracy of) the learned or trained system.
You will need to divide your data before you start training your model.
The most common splitting choice is to take between 2/3 and 3/4 of your original data set as the training set, while the remainder will be left aside and will compose the test set. (We'll talk about more sophisticated methods, but this is the simplest).
But the question is, how do you split the data? It is best to split the data in a random manner, which you won't do well by hand (not to mention that it is time consuming). To help us with this task, the scikit-learn library provides the model_selection module, which contains a function called train_test_split(). Using it we can easily split the dataset into training and testing sets in various proportions.
In this case it may seem silly to "predict" something that already has a label. However, the point is actually not to guess the label for something you already have a label for, but to test whether the relation between variables that you have modeled is capable of guessing the correct label for new, previously unseen data. When your model is trained on a sample of data with the correct labels, we call this supervised learning.
Since “correct” output is determined entirely from the labels in the data (what we call "ground truth" -- our model assumes ground truth is "Truth"), noisy or incorrectly labeled data will clearly reduce the effectiveness of your model. Ground truth data labels are rarely perfect in real-world situations (especially for music!). For this course's final projects, you will have very high quality ground truth for all datasets.
We will be using the sklearn library for our classification models.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
Convert the 'Class' column to a binary (0/1) numeric variable. You can either do this manually (by changing "good" to 1s and "bad" to 0s), or else use the pd.get_dummies function shown below.
#make new column of categorical binary outcomes
class_cat = pd.get_dummies(df['Class'],drop_first=True)
#get rid of the old column
df.drop(['Class'],axis=1,inplace=True)
#concatenate the new column to the dataframe
newdf = pd.concat([class_cat,df],axis=1)
Note that it is easiest to always have your dependent (outcome) variable as your first or last column of data.
newdf.head()
| | good | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | ... | V25 | V26 | V27 | V28 | V29 | V30 | V31 | V32 | V33 | V34 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.99539 | -0.05889 | 0.85243 | 0.02306 | 0.83398 | -0.37708 | 1.00000 | 0.03760 | 0.85243 | ... | 0.56811 | -0.51171 | 0.41078 | -0.46168 | 0.21266 | -0.34090 | 0.42267 | -0.54487 | 0.18641 | -0.45300 |
| 1 | 0 | 1.00000 | -0.18829 | 0.93035 | -0.36156 | -0.10868 | -0.93597 | 1.00000 | -0.04549 | 0.50874 | ... | -0.20332 | -0.26569 | -0.20468 | -0.18401 | -0.19040 | -0.11593 | -0.16626 | -0.06288 | -0.13738 | -0.02447 |
| 2 | 1 | 1.00000 | -0.03365 | 1.00000 | 0.00485 | 1.00000 | -0.12062 | 0.88965 | 0.01198 | 0.73082 | ... | 0.57528 | -0.40220 | 0.58984 | -0.22145 | 0.43100 | -0.17365 | 0.60436 | -0.24180 | 0.56045 | -0.38238 |
| 3 | 0 | 1.00000 | -0.45161 | 1.00000 | 1.00000 | 0.71216 | -1.00000 | 0.00000 | 0.00000 | 0.00000 | ... | 1.00000 | 0.90695 | 0.51613 | 1.00000 | 1.00000 | -0.20099 | 0.25682 | 1.00000 | -0.32382 | 1.00000 |
| 4 | 1 | 1.00000 | -0.02401 | 0.94140 | 0.06531 | 0.92106 | -0.23255 | 0.77152 | -0.16399 | 0.52798 | ... | 0.03286 | -0.65158 | 0.13290 | -0.53206 | 0.02431 | -0.62197 | -0.05707 | -0.59573 | -0.04608 | -0.65697 |
5 rows × 33 columns
Separate the data column containing the ground truth (the "target" or outcome variable) from the remaining columns containing the feature data. Call the subsets 'outcome' and 'features', respectively.
#all rows, first through remaining columns - predictor variables
features = newdf.iloc[:,1:]
#all rows, first column - outcome variable
outcome = newdf.iloc[:,0]
Split the data into training and test sets using the train_test_split function from sklearn. Note: add a random_state argument by setting it equal to any integer value (the same idea as a 'seed' in other programs/functions you may have used). This ensures that your split comes out the same every time you run the code -- note that this does not mean it will come out the same as everyone else's!
# test size can be a proportion or an integer
feat_train, feat_test, out_train, out_test = train_test_split(features, outcome, test_size = .25, random_state=25)
#have a look
feat_train.head()
| | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | ... | V25 | V26 | V27 | V28 | V29 | V30 | V31 | V32 | V33 | V34 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 258 | 0.61538 | 0.18923 | 0.78157 | 0.01780 | 0.77486 | 0.02647 | 0.65077 | -0.10308 | 0.77538 | 0.08000 | ... | 0.60154 | -0.07231 | 0.58803 | 0.08767 | 0.55077 | 0.25692 | 0.53389 | 0.09207 | 0.50609 | 0.09322 |
| 13 | 1.00000 | -0.86701 | 1.00000 | 0.22280 | 0.85492 | -0.39896 | 1.00000 | -0.12090 | 1.00000 | 0.35147 | ... | 1.00000 | -0.17012 | 1.00000 | 0.35924 | 1.00000 | -0.66494 | 1.00000 | 0.88428 | 1.00000 | -0.18826 |
| 140 | 0.94531 | -0.03516 | -1.00000 | -0.33203 | -1.00000 | -0.01563 | 0.97266 | 0.01172 | 0.93359 | -0.01953 | ... | 0.95703 | -0.00391 | 0.82041 | 0.13758 | 0.90234 | -0.06641 | -1.00000 | -0.18750 | -1.00000 | -0.34375 |
| 347 | 0.95113 | 0.00419 | 0.95183 | -0.02723 | 0.93438 | -0.01920 | 0.94590 | 0.01606 | 0.96510 | 0.03281 | ... | 0.94520 | 0.01361 | 0.93522 | 0.04925 | 0.93159 | 0.08168 | 0.94066 | -0.00035 | 0.91483 | 0.04712 |
| 230 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | ... | 0.00000 | 0.00000 | 1.00000 | 1.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
5 rows × 32 columns
len(feat_train)
263
len(out_train)
263
len(feat_test)
88
Create a LogisticRegression classifier object using solver='lbfgs' and save the output to a variable called "model".
model = LogisticRegression(solver='lbfgs') #for simple binary logistic regression we can use all default parameters
model.fit(feat_train, out_train)
predictions = model.predict(feat_test)
You can examine the fitted coefficients with the attribute model.coef_. There will be one for every column (or feature).
model.coef_
array([[ 1.90206386, 1.28415539, 1.49901051, 1.18921995, 0.6811774 ,
1.20243551, 1.17014692, -0.1152072 , -0.65207636, -0.42963413,
-0.53585875, 1.48519087, 0.44943146, -0.02499059, 0.73960873,
0.20423089, -0.12822177, 0.08351912, 0.28742275, -1.77467996,
0.38173375, 0.37707681, -0.18917504, 0.86359342, -1.87976026,
-0.57361426, 0.78403522, 0.52863255, 0.80008966, 0.27381272,
-0.49355094, -1.04142605]])
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(out_test, predictions, labels=[0,1])#parameters are y_truth (i.e.,Ground truth (correct) target values), y_pred (i.e., predicted values from model).
cm
array([[17, 11],
[ 2, 58]])
pd.DataFrame(cm, columns=["bad","good"], index=["bad","good"])
| | bad | good |
|---|---|---|
| bad | 17 | 11 |
| good | 2 | 58 |
Let's use an example model output:
| | Died | Survived |
|---|---|---|
| Predicts_died | 159 | 28 |
| Predicts_survived | 35 | 92 |
The confusion matrix helps us determine how often the model's prediction is correct -- in other words, the accuracy of the model (what proportion of the time it is correct). From the table above, it is given by: (TP + TN) / Total = (92 + 159) / 314 ≈ 0.80
Accuracy is, in other words, intuitive: how often did the model 'get it right'?
(92+159)/314
0.7993630573248408
This means that the model is 80% correct. The confusion matrix can also be used to measure the error rate (the remainder), which is given by:
(FP + FN) / Total = (28 + 35) / 314 = 63 / 314 ≈ 0.20
There is 20% error in the model.
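As a quick sanity check, here is a small sketch computing both numbers directly from the example table above (the matrix here is the made-up example, not our ionosphere results):

import numpy as np

# the example confusion matrix from the table above (rows = predictions)
example_cm = np.array([[159, 28],
                       [ 35, 92]])
accuracy = np.trace(example_cm) / example_cm.sum()   # (TP + TN) / Total
error_rate = 1 - accuracy                            # (FP + FN) / Total
print(accuracy, error_rate)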
A common way of evaluating a classification model is with precision and recall values. Precision and recall allows us to break down the 'accuracy' result into how many positives it is correctly identifying versus how many negatives. This is very important, for example, in medicine. You may invent a test to identify the presence of a disease. We want to separate:
a) What proportion of individuals "labeled positive" actually (truly) have the disease? (Precision)
b) What proportion of actual positives (people with the disease) were correctly identified? (Recall, or 'sensitivity')
What percentage of the model's positive predictions are TRUE? =
True Positives / (True Positives + False Positives) =
92 / (92 + 35) = 72%
92 / (92 + 35)
0.7244094488188977
What percentage of actual TRUE (positive) cases does the model predict as positive? =
True Positives / (True Positives + False Negatives) =
92 / (92 + 28) = 77%
92 / (92 + 28)
0.7666666666666667
The F1 score combines precision and recall into a single number by taking their harmonic mean: F1 = 2 * (precision * recall) / (precision + recall).
2 * ((.72 * .77) / (.72 + .77))
0.7441610738255033
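These same quantities can be computed directly with scikit-learn's metric functions. Here is a minimal sketch applied to our ionosphere predictions from earlier (treating 'good', which we coded as 1, as the positive class):

from sklearn.metrics import precision_score, recall_score, f1_score

# precision, recall, and F1 for the 'good' (1) class of the ionosphere model
print(precision_score(out_test, predictions))
print(recall_score(out_test, predictions))
print(f1_score(out_test, predictions))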
I am teaching you about precision, recall, and F1 scores because they are important concepts in modeling and data science. However, for the purposes of this class, we are mainly interested in the accuracy score. That said, these other metrics can be very important for certain kinds of tasks, and are especially important when (a) data on the dependent measure are imbalanced (e.g., the majority of people died vs. survived), or (b) you have a task where one kind of "right" (i.e., true positive vs. true negative) is more important than the other.
You can get all these values in one line by calling the classification_report function on your data:
after importing it with: from sklearn.metrics import classification_report
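For example, a minimal sketch on the ionosphere test set from above (assuming, as in our encoding, that 0 corresponds to 'bad' and 1 to 'good'):

from sklearn.metrics import classification_report

# precision, recall, f1-score, and support for each class in one call
print(classification_report(out_test, predictions, target_names=['bad', 'good']))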
Frequently, we need to make predictions of a value with more than 2 categories, such as genre classification. In this case, we need to perform Multinomial Logistic Regression.
The actual modeling process is not that different from the last example (from a coding perspective). We will use a different dataset, Iris, to illustrate. (Download the iris.csv file from Canvas).
iris = pd.read_csv('../Datasets/iris.csv')
len(iris)
150
There are 150 rows of data in this dataset.
iris.head()
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
Let's see how many categories of Iris flowers there are.
iris['Species'].unique() #gives the unique list of events in the specified column
array(['setosa', 'versicolor', 'virginica'], dtype=object)
We want to see if any of the features (various measurements) can predict the different species.
As always, let us graph our data first to get a sense of the variability and relatedness of each feature:
iris.boxplot()
Let's plot the correlation matrix.
iris.corr()
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
|---|---|---|---|---|
| Sepal.Length | 1.000000 | -0.117570 | 0.871754 | 0.817941 |
| Sepal.Width | -0.117570 | 1.000000 | -0.428440 | -0.366126 |
| Petal.Length | 0.871754 | -0.428440 | 1.000000 | 0.962865 |
| Petal.Width | 0.817941 | -0.366126 | 0.962865 | 1.000000 |
Notice that "Petal Length" & "Sepal Length" are correlated with an r value of .87, and "Petal Length & Petal Width" are correlated with an r value of .96!
What this means is that the model may struggle to find the optimal "fit": highly correlated predictors can confuse the model and make it hard to converge on the "correct" line of fit. We'll come back to how to deal with this.
Let's also look to see if the means and spread of the values appear different across our four feature categories:
#Here is a handy function, btw
iris.describe()
| | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
iris.boxplot(column=['Sepal.Length', 'Sepal.Width', 'Petal.Length','Petal.Width'], by=['Species'], figsize=(10,7));
As before, separate your dataframe into 'categories to predict' and 'predictors':
outcome = iris[['Species']]
predictors = iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length','Petal.Width']]
#predictors.columns = ['Sepal.Length', 'Sepal.Width', 'Petal.Length','Petal.Width']
Create training and test sets
feat_train, feat_test, out_train, out_test = train_test_split(predictors, outcome, test_size=.33, random_state=5)
As before, import the LogisticRegression module and create a Logistic Regression classifier object using LogisticRegression() function.
#Note we use the same `LogisticRegression` function but change the "class" parameter:
multimodel = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 5000)
Then, fit your model on the train set using fit() and perform prediction on the test set using predict().
multimodel.fit(feat_train, out_train.Species) #note that in the model fit you have to specify the column with the category names to train on
model_pred=multimodel.predict(feat_test)
import math
multi_coefficients = pd.DataFrame(multimodel.coef_)
multi_coefficients.columns = predictors.columns
multi_coefficients.index = ['setosa', 'versicolor', 'virginica'] #recall 'unique' function earlier
multi_coefficients
print(multi_coefficients.applymap(math.exp))
            Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
setosa          0.664431     2.332885      0.110197     0.381028
versicolor      1.831750     0.643341      0.851845     0.410404
virginica       0.821644     0.666293     10.652900     6.394865
These coefficients can be understood as describing the contribution of each feature to predicting each particular Species (or category). Roughly speaking, each (exponentiated) value indicates how the odds of that class change with a one-unit increase in that feature, relative to the other classes.
Let's look at the confusion matrix:
from sklearn import metrics
iris_matrix = metrics.confusion_matrix(out_test, model_pred)
iris_matrix
array([[16, 0, 0],
[ 0, 16, 1],
[ 0, 0, 17]])
The diagonal of the confusion matrix always represents the correctly predicted items. Let's convert it to a DataFrame structure again to make it easier to read (note that scikit-learn puts the true labels on the rows and the predicted labels on the columns):
pd.DataFrame(iris_matrix, index=['True_setosa','True_versicolor','True_virginica'], columns=['Pred_setosa','Pred_versicolor','Pred_virginica'])
| | Pred_setosa | Pred_versicolor | Pred_virginica |
|---|---|---|---|
| True_setosa | 16 | 0 | 0 |
| True_versicolor | 0 | 16 | 1 |
| True_virginica | 0 | 0 | 17 |
Evaluate the model (as before) using model evaluation metrics such as accuracy, precision, and recall.
from sklearn.metrics import classification_report, precision_recall_fscore_support
print(classification_report(out_test, model_pred))
precision recall f1-score support
setosa 1.00 1.00 1.00 16
versicolor 1.00 0.94 0.97 17
virginica 0.94 1.00 0.97 17
accuracy 0.98 50
macro avg 0.98 0.98 0.98 50
weighted avg 0.98 0.98 0.98 50
In this case, our dataset was pretty equally weighted among flowers of the 3 types. Sometimes we have datasets where up to 75% or 80% of the values are all of one variety or class. While this can make it a little trickier to get good results, it's still possible. However, a model that simply guessed the most popular class in that case would end up with an accuracy score equal to the proportion of that class! (E.g., if you try to predict survival and 70% of people died, then a model that simply always chose the most likely answer would be correct 70% of the time.)
That is why when we have 'biased' datasets, we will always want to provide a "baseline" to show that the model we made improves classification "over and above" the simple baseline model.
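One simple way to get such a baseline is scikit-learn's DummyClassifier, which just predicts the most frequent class. Here is a sketch using the iris split from above (not something you are required to do, but useful for comparison):

from sklearn.dummy import DummyClassifier

# a "most frequent class" baseline to compare against the real model's accuracy
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(feat_train, out_train.Species)
print(baseline.score(feat_test, out_test.Species))  # baseline accuracy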
Unsupervised learning is effectively a tool for self-organization of data. Given some set of input features (say, color, shape, etc.), a model tries to group objects based on features that appear to be shared. (See example image below.)
The most common task for unsupervised learning is clustering, or classification when the optimal combination of features that distinguishes the data is unknown and there are no labels for the data. Sometimes unsupervised learning is simply used to learn about the structure of the data.
A great example for music is so-called "functional" chord analysis. There is a system in music theory where any chord can supposedly be labeled as either "T" (tonic), "P" (predominant), or "D" (dominant). However, maybe there is a more appropriate way of categorizing Roman numerals that better accounts for the way that chords behave? (Interested? See: Music Theory Spectrum, Volume 40, Issue 2, Fall 2018, Pages 314–335, https://doi.org/10.1093/mts/mty021)
In all of these cases, we wish to learn the inherent structure of our data without using explicitly-provided labels or a predetermined number of "states". Since no labels are provided, there is no specific way to compare model performance in most unsupervised learning methods.
Image('../images/unsupervisedLearning.png', embed=True)
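As a small illustration (not required for the final), here is a clustering sketch using scikit-learn's KMeans on the iris feature columns, ignoring the Species labels entirely; note that with k-means we do have to choose the number of clusters ourselves:

from sklearn.cluster import KMeans

# group the iris measurements into 3 clusters without using the Species labels
kmeans = KMeans(n_clusters=3, n_init=10, random_state=5)
cluster_labels = kmeans.fit_predict(predictors)

# compare the discovered clusters with the (withheld) species labels
print(pd.crosstab(cluster_labels, iris['Species']))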
You will likely read about many types of machine learning tools and models for classification and clustering. Since explaining these approaches is beyond the scope of this class, AND we are not concerned with accuracy but rather a proof-of-concept implementation, we will simply stick with simple regression models for this class. (In other words, if you read online about things like "SVMs", "random forests", or "neural networks", you can simply know for now that these are all different methods for optimizing a classification model, but you shouldn't attempt to use one of them in your final project without consulting me first.)