import sklearn
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = [12, 6]
matplotlib.rcParams['font.size'] = 24
In this lesson we will walk through a classification problem: in this case a binary one, so only two categories. We will use a kind of generalized linear model called logistic regression to carry out the task.
In this scenario, we have a single dichotomous target variable with exactly two categories (e.g., speech vs. music) that we are trying to predict from a collection of other variables that we have collected and labeled (i.e., our extracted "features").
For this task we will look at the Titanic Survival dataset. We will see whether any of the available features can predict whether a person survived the sinking of the Titanic.
data = pd.read_csv('../Datasets/TitanicSurvival.csv')
data.head()
| | Unnamed: 0 | survived | sex | age | passengerClass |
|---|---|---|---|---|---|
| 0 | Allen, Miss. Elisabeth Walton | yes | female | 29.0000 | 1st |
| 1 | Allison, Master. Hudson Trevor | yes | male | 0.9167 | 1st |
| 2 | Allison, Miss. Helen Loraine | no | female | 2.0000 | 1st |
| 3 | Allison, Mr. Hudson Joshua Crei | no | male | 30.0000 | 1st |
| 4 | Allison, Mrs. Hudson J C (Bessi | no | female | 25.0000 | 1st |
#Find any null values (this shouldn't affect your projects, but can happen when you have incomplete data)
data.isnull().sum()
Unnamed: 0          0
survived            0
sex                 0
age               263
passengerClass      0
dtype: int64
Age has a bunch of missing values, so we will just delete those rows, since models don't handle missing or NaN values well. (This shouldn't be a problem with the data you are working on for your 2526 finals, but it is useful to know.)
data.dropna(inplace=True)
data.isnull().sum()
Unnamed: 0        0
survived          0
sex               0
age               0
passengerClass    0
dtype: int64
In the Pandas library, a column of data is the equivalent of a numpy array. Pandas calls a 1-dimensional column of data a Series object. When you want to tally all the possible values of a Series, a handy function is value_counts().
data['survived'].value_counts() # count the 'survived' column: tallies of 'no' and 'yes'
# to get proportions instead of counts, pass normalize=True to value_counts()
no     619
yes    427
Name: survived, dtype: int64
We can pass the output from value_counts() directly to pandas' built-in plots (which run matplotlib in the background). Note that you can always call pandas.Series.to_numpy() to convert a Series object back to a numpy array. (You can also call .to_numpy() on DataFrame objects to create a multidimensional array.)
data['survived'].value_counts(normalize=True).plot.bar() # plot the categorical 'survived' data as proportions
# with normalize=True we get proportions of the total; leave it out to plot raw counts instead
<AxesSubplot: >
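As a quick aside, here is what the .to_numpy() conversion mentioned above looks like (a minimal sketch; the variable names are just for illustration and aren't used later in the lesson):
ages = data['age'].to_numpy() # 1-D numpy array from the 'age' Series
print(type(ages), ages.shape)
full_array = data.to_numpy() # 2-D numpy array from the whole DataFrame (mixed dtypes become dtype 'object')
print(full_array.shape)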
When you have more than a single variable you'd like to count up (i.e., not just a Series object), you can use the crosstab() function. For example, let's look at counts of survival data according to gender:
pd.crosstab(data['sex'],data['survived'])
| sex \ survived | no | yes |
|---|---|---|
| female | 96 | 292 |
| male | 523 | 135 |
Again, we can pass this table directly to a bar plot, which will automatically create a grouped bar plot with a legend:
#investigate what features might be predictive of the yes/no classification
#the pandas crosstab function will count things based on some category/classification
pd.crosstab(data['sex'],data['survived']).plot.bar()
<AxesSubplot: xlabel='sex'>
Looks like gender would be a good predictor of survival, because there is a strong difference. We can also examine passenger class:
pd.crosstab(data['passengerClass'],data['survived']).plot.bar()
<AxesSubplot: xlabel='passengerClass'>
The same goes for passenger class: there are clear visual differences.
Let's look at age, but now using a boxplot. A boxplot is a standardized way of displaying the distribution of data based on a five-number summary: the "minimum", the first quartile (Q1, or 25th percentile), the median, the third quartile (Q3, or 75th percentile), and the "maximum". It also shows outliers: dots beyond the whiskers mark values more than 1.5 times the interquartile range (IQR) past the quartiles, which for roughly normal data is only about 0.7% of the points (about 0.35% in each tail). A boxplot can also tell you whether your data are symmetrical, how tightly they are grouped, and whether and how they are skewed.
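Before plotting, here is a minimal sketch of how those whisker bounds are computed for the age column (the variable names are just for illustration):
q1 = data['age'].quantile(0.25)
q3 = data['age'].quantile(0.75)
iqr = q3 - q1 # interquartile range
lower_whisker = q1 - 1.5 * iqr # points below this get drawn as outlier dots
upper_whisker = q3 + 1.5 * iqr # points above this get drawn as outlier dots
print(data['age'].min(), q1, data['age'].median(), q3, data['age'].max())
print(lower_whisker, upper_whisker)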
fig, ax = plt.subplots()
data.boxplot(by='survived', ax=ax)
#get rid of default title
fig.suptitle("")
#set your own title
ax.set_title('Survival by Age', pad=20)
Text(0.5, 1.0, 'Survival by Age')
Looks like age probably isn't a great predictor because the distributions look very similar. Let's look at differences in age among passengers in each of the classes:
fig, ax = plt.subplots()
data.boxplot('age','passengerClass', ax=ax) #first argument is the column to plot (y-axis), second is the column to group by (x-axis)
#possible slight correlation between age and passenger class (1st class tends to have slightly older people)
fig.suptitle("")
ax.set_title('Age by Passenger Class', pad=20)
Text(0.5, 1.0, 'Age by Passenger Class')
https://online.stat.psu.edu/stat501/lesson/12/12.3
"High multicollinearity among predictor variables does not prevent good, precise predictions of the response within the scope of the model. Well, okay, it's not an effect, and it's not bad news either! It is good news! If the primary purpose of your regression analysis is to estimate a mean response or to predict a new response y, you don't have to worry much about multicollinearity."
Recall that our collective "bunch of variables" (our features) are called our predictor variables. But before throwing data about every potential predictor under the sun into your regression model, there is something to mention called "multicollinearity."
With regression--like many things--there comes a point where more is not necessarily better. In fact, sometimes adding more factors to a regression model makes things harder to understand, and can even make things worse.
In regression, "multicollinearity" refers to predictors that are strongly correlated with other predictors. In other words, it arises when you have factors/features that are largely redundant.
Features that are strongly correlated with each other inflate the standard errors of the estimated coefficients - think of it like adding noise. This can have the unfortunate effect of making it seem like some feature isn't important when it really is.
So, in other words: in regression models, strong correlations between features are bad! How do I know if it's a problem? Examine the correlations!
Recall that you can use df.corr() (where df is the name of your pandas DataFrame) to examine the correlations between columns of features. Correlations can only be computed on numerical variables, but in this case we have some categorical variables that have to be converted to numerical 'placeholders'. We call these "dummy variables".
A dummy variable (aka, an indicator variable) is a numeric variable that represents categorical data, such as gender, race, political affiliation, etc. (Note: You probably will not have any categorical data.) It is necessary to do this so that the model can understand the data properly.
gender = pd.get_dummies(data['sex'],drop_first=True)
gender.head() #notice there is now only one column: drop_first=True drops the first category (female), leaving a 0/1 indicator for 'male'
| | male |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 0 |
| 3 | 1 |
| 4 | 0 |
#do the same for all categorical variables
pClass = pd.get_dummies(data['passengerClass'],drop_first=True)
survd = pd.get_dummies(data['survived'],drop_first=True)
We will drop the columns 'survived', 'sex', and 'passengerClass' and replace them with our new "dummy" columns. Since the indices provide a way of tracking each unique passenger (an ID), we can also drop the names column.
data.drop(['Unnamed: 0','sex', 'survived','passengerClass'],axis=1,inplace=True)
data_dmy = pd.concat([survd,data,gender,pClass],axis=1) #good practice to put the target variable in the first (or last) column
data_dmy.head()
| | yes | age | male | 2nd | 3rd |
|---|---|---|---|---|---|
| 0 | 1 | 29.0000 | 0 | 0 | 0 |
| 1 | 1 | 0.9167 | 1 | 0 | 0 |
| 2 | 0 | 2.0000 | 0 | 0 | 0 |
| 3 | 0 | 30.0000 | 1 | 0 | 0 |
| 4 | 0 | 25.0000 | 0 | 0 | 0 |
data_dmy.corr()
| | yes | age | male | 2nd | 3rd |
|---|---|---|---|---|---|
| yes | 1.000000 | -0.055513 | -0.538000 | 0.038000 | -0.286257 |
| age | -0.055513 | 1.000000 | 0.063646 | -0.014986 | -0.337069 |
| male | -0.538000 | 0.063646 | 1.000000 | -0.028289 | 0.134063 |
| 2nd | 0.038000 | -0.014986 | -0.028289 | 1.000000 | -0.552848 |
| 3rd | -0.286257 | -0.337069 | 0.134063 | -0.552848 | 1.000000 |
Sometimes it is helpful to visualize these correlations. pcolor is a handy plotting tool in matplotlib, and you can change the default color mapping by passing one of its other "color maps".
You can check out the available color maps (and the code to generate them) in the matplotlib colormaps documentation. Note: in general, it has been shown that monochromatic color scales are easier for people to interpret.
plt.pcolor(data_dmy.corr(), cmap='Blues')
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x282b04bb0>
No strong correlations among the predictors here (the moderate negative correlation between the 2nd and 3rd class dummies is expected, since a passenger can only be in one class). Let's move on.
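If you would rather check this programmatically than by eye, here is a minimal sketch that flags any pair of columns whose absolute correlation exceeds a threshold (the 0.8 cutoff is an arbitrary choice for illustration):
import numpy as np
corr = data_dmy.corr().abs()
# keep only the upper triangle so each pair of columns is examined once (and the diagonal is skipped)
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack() # (column, column) -> absolute correlation
print(pairs[pairs > 0.8]) # empty here; note this table includes the target column 'yes', but multicollinearity only concerns the predictors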
Recall that we need to split up our dataset into data for training and data for testing. We'll use the sklearn package to do this.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
You would use a simple logistic regression model any time you have a dichotomous variable that you are trying to classify (e.g., speech/music). When you have more than two categories, it's still logistic regression, but we call it multinomial logistic regression (we'll come back to that).
Let's split the dataset using the function train_test_split(). You need to pass 3 parameters: the features (predictors), the target (categories), and the test set size.
#if you don't know what I'm indexing here, go back and watch the pandas videos from Lesson 22.
#"categories" is the dependent variable (the thing I am trying to classify); "predictors" are the features
predictors = data_dmy.iloc[:,1:]
categories = data_dmy.iloc[:,0]
#divide up the data into four groups: training and testing sets of both the predictors and the target.
#test size is typically anywhere from .2 to .5 depending on how much data you have.
#For class purposes, training on about 2/3 of the data and testing on the other 1/3 is appropriate.
pred_train, pred_test, cat_train, cat_test = train_test_split(predictors, categories, test_size = .3, random_state=25)
Below is where you actually pass the data to a specific model.
model = LogisticRegression(solver='lbfgs') #for simple binary logistic regression we can use all default parameters
model.fit(pred_train, cat_train)
predictions = model.predict(pred_test)
model.coef_
array([[-0.02898469, -2.30018395, -1.14353657, -2.18113167]])
coeffs = dict(zip(data_dmy.columns[1:], model.coef_[0])) # pair each feature name with its fitted coefficient
coeffs
{'age': -0.028984688218593083,
'male': -2.3001839469170244,
'2nd': -1.143536572419241,
'3rd': -2.181131669678145}
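These coefficients are on the log-odds scale (negative values mean the feature lowers the odds of survival). One common way to make them easier to read, sketched here as an optional extra, is to exponentiate them into odds ratios:
import numpy as np
# e.g., 'male' comes out to roughly 0.10: being male multiplies the odds of survival by about a tenth, holding the other features constant
odds_ratios = {name: np.exp(coef) for name, coef in coeffs.items()}
odds_ratios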
After we train a logistic regression model on the training data, we will evaluate its performance on the test data. To evaluate the output, we rely on something called a "confusion matrix". A confusion matrix is a table that is often used to describe the performance of a classification model on a set of test data for which the true values (i.e., ground truth) are already known.
Here, TP stands for True Positive: the cases in which we predicted yes and the actual value was true. TN stands for True Negative: the cases in which we predicted no and the actual value was false. FP stands for False Positive: the cases in which we predicted yes and the actual value was false. FN stands for False Negative: the cases in which we predicted no and the actual value was true.

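For reference, the generic layout of a confusion matrix, with rows as the actual values and columns as the model's predictions (the same convention sklearn uses), is:
| | Predicted: No | Predicted: Yes |
|---|---|---|
| Actual: No | TN | FP |
| Actual: Yes | FN | TP |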
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(cat_test, predictions) #parameters are y_true (the ground-truth target values) and y_pred (the values predicted by the model)
cm
array([[159, 28],
[ 35, 92]])
#Convert to dataframe for easier reading:
pd.DataFrame(cm, index=['Actual_died','Actual_survived'], columns=['Predicted_died','Predicted_survived'])
| | Predicted_died | Predicted_survived |
|---|---|---|
| Actual_died | 159 | 28 |
| Actual_survived | 35 | 92 |
The actual values here may differ for each of you who runs this model if you use a different random_state, because the rows forming the test set are randomly selected.
My model is telling me that 159 and 92 are the true negatives and true positives, respectively. You should have estimates in a similar ballpark.
The confusion matrix helps us determine how often the model's prediction is correct, or in other words the accuracy of the model (the proportion of the time it is correct). From the table above, it is given by: (TP + TN) / Total = (92 + 159) / 314 ≈ 0.80
Accuracy is, in other words, intuitive: how often did the model 'get it right'?
(92+159)/314
0.7993630573248408
This means that the model is about 80% correct. The confusion matrix is also used to measure the error rate (the leftover), which is given by:
(FP + FN) / Total = (28 + 35) / 314 ≈ 0.20
There is about 20% error in the model.
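You can also get the accuracy directly from sklearn instead of computing it by hand; a quick sketch:
from sklearn.metrics import accuracy_score
accuracy_score(cat_test, predictions) # ~0.80, matching the hand calculation above; the error rate is just 1 - accuracy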
A common way of evaluating a classification model is with precision and recall values. Precision and recall allow us to break down the 'accuracy' result into how well the model identifies positives versus negatives. This is very important, for example, in medicine. You may invent a test to identify the presence of a disease. We want to separate:
a) What proportion of individuals "labeled positive" actually (truly) have the disease? (Precision)
b) What proportion of actual positives (people with the disease) were correctly identified? (Recall, or 'sensitivity')
What percentage of the model's positives are TRUE? (Precision) =
True Positives / (True Positives + False Positives) =
92 / (92 + 28) = 77%
92 / (92 + 28)
0.7666666666666667
What percentage of the actual TRUE values does the model predict positive? (Recall) =
True Positives / (True Positives + False Negatives) =
92 / (92 + 35) = 72%
92 / (92 + 35)
0.7244094488188977
The F1 score combines the two as their harmonic mean:
2 * ((.77 * .72) / (.77 + .72))
0.7441610738255033
I am teaching you about precision, recall, and F1 scores because they are important concepts in modeling and data science. However, for the purposes of this class, we are mainly interested in the accuracy score. That said, these other metrics can be very important for certain kinds of tasks, especially when (a) the data on the dependent measure are imbalanced (e.g., far more people died than survived), or (b) you have a task where one kind of "right" (true positive vs. true negative) matters more than the other.
You can get all these values in one line by calling the classification_report function on your data:
from sklearn.metrics import classification_report, precision_recall_fscore_support
print(classification_report(cat_test, predictions))
#note we get precision and recall separately for the positive (1) and negative (0) classes (e.g., 'what percentage of the model's predicted negatives are actually negative?' is the precision for class 0)
precision recall f1-score support
0 0.82 0.85 0.83 187
1 0.77 0.72 0.74 127
accuracy 0.80 314
macro avg 0.79 0.79 0.79 314
weighted avg 0.80 0.80 0.80 314
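The precision_recall_fscore_support function imported above returns the same numbers as arrays, which is handy if you want to use them in code rather than read a printed report; a quick sketch:
precision, recall, f1, support = precision_recall_fscore_support(cat_test, predictions)
precision[1], recall[1], f1[1] # metrics for the 'survived' (1) class: roughly 0.77, 0.72, and 0.74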
What's different if you have multiple possible categories (no longer binary), such as in a genre-identification task?
Not much. You use the same LogisticRegression function as above; however, you will change an important parameter:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
You can leave your target categories in a single column and do not have to convert them to dummy variables when creating the train/test data.
Otherwise, everything is the same as shown here.
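As a minimal, self-contained sketch of the multinomial case (using sklearn's built-in iris data rather than the Titanic data, purely to show the call pattern with a three-category target):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X, y = load_iris(return_X_y=True) # 4 numeric features, 3 target classes (0, 1, 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=25)
multi_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
multi_model.fit(X_train, y_train)
multi_model.score(X_test, y_test) # accuracy on the held-out test data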