DAY45. Python Matplot (3)Seaborn
Seaborn 패키지
다양한 배경 테마 제공
통계용 차트 제공
자체 dataset 제공
import seaborn as sn # alias
제공 dataset 확인
# Print the names of the example datasets that seaborn can load.
print(sn.get_dataset_names())
# Recorded output of the call above; as a bare list literal it has no effect when run.
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'taxis', 'tips', 'titanic']
# Number of names returned when these notes were written.
len(sn.get_dataset_names()) # 19
2. dataset load
# Load four of seaborn's example datasets and inspect each one.
iris = sn.load_dataset('iris')
type(iris)  # pandas.core.frame.DataFrame
iris.info()
iris.head()

tips = sn.load_dataset('tips')
tips.info()

titanic = sn.load_dataset('titanic')
titanic.info()

flights = sn.load_dataset('flights')
flights.info()
object vs category
object : 순서 변경이 불가능한 문자열
category : 순서 변경이 가능한 문자열
import seaborn as sn
1. object vs category
dataset load
# Load the two example datasets used in this section and show the
# column summary of the titanic frame.
tips = sn.load_dataset('tips')
titanic = sn.load_dataset('titanic')
titanic.info()
subset 생성
# Take the four columns of interest as an independent copy, so that adding
# a column to `df` later does not raise pandas' SettingWithCopyWarning.
df = titanic[['survived', 'age', 'class', 'who']].copy()
df.info()
2 class 891 non-null category
3 who 891 non-null object
category 정렬
# Preview the first rows of the subset.
df.head()
0 survived 891 non-null int64
1 age 714 non-null float64
2 class 891 non-null category
3 who 891 non-null object
object 정렬
# Sort the rows by the string (object) column 'who'.
df.sort_values(by = 'who') # ascending order
# Distinct values of 'who'.
df['who'].unique() #array(['man', 'woman', 'child'], dtype=object)
'child' > 'man' > 'woman' : child(base)
object -> category
'man' > 'woman' > 'child' : man(base)
category ์์ : DF['column'].astype('type')
# Add 'who_new': the 'who' column converted to the category dtype.
df['who_new'] = df['who'].astype('category')
df.head()
df.info()
3 who 891 non-null object
4 who_new 891 non-null category
범주 순서 변경
# Redefine the categories of 'who_new' in the order man, woman, child.
df['who_new'] = df['who_new'].cat.set_categories(['man', 'woman', 'child'])
R에서는
df$who_new = factor(df$who_new, levels = c('man', 'woman', 'child'))
category 정렬
# Sort the rows by the categorical column 'who_new'.
df.sort_values(by = 'who_new')
2. 범주형 자료 시각화
1) 그래프 배경 스타일
# Use seaborn's 'darkgrid' theme for the plots that follow.
sn.set_style('darkgrid')
2) 범주형 자료 시각화
# Count plots: one bar per category level.
tips.head()
sn.countplot(data=tips, x='smoker')  # 2 categories
sn.countplot(data=titanic, x='class')  # 3 categories
sn.countplot(data=tips, x='day')  # 4 categories
continuous
연속형 변수 시각화
히스토그램, 산점도, 산점도 행렬, boxplot
import matplotlib.pyplot as plt
import seaborn as sn
dataset load
# Load the example datasets used for the continuous-variable plots.
iris = sn.load_dataset(name='iris')
tips = sn.load_dataset(name='tips')
type(iris)  # pandas.core.frame.DataFrame
1. 히스토그램
# Histogram of the sepal_length column.
iris.info()
x = iris['sepal_length']  # same column as iris.sepal_length
sn.histplot(x)
plt.title('sepal_length hist')
plt.show()
2. distplot : hist + kde
# Histogram with a kernel density estimate overlaid.
# distplot() is deprecated in current seaborn releases; histplot() with
# kde=True is its replacement, and stat='density' keeps the density-scaled
# y-axis that distplot() drew.
sn.histplot(x, bins=20, kde=True, stat='density')
plt.title('sepal_length hist & kde')
plt.show()
kde : 커널밀도추정
3. 산점도 행렬
sn.pairplot(data = DataFrame, hue='집단변수')
# Scatter-plot matrix of the iris columns, coloured by species.
sn.pairplot(iris, hue='species')
plt.show()
4. 산점도 : 연속형 vs 연속형
# Scatter plot of petal_length against sepal_length.
sn.scatterplot(data=iris, x='sepal_length', y='petal_length')
plt.show()
hue='집단변수'
# Same scatter plot, with points coloured by species.
sn.scatterplot(data=iris, x='sepal_length', y='petal_length', hue='species')
plt.show()
5. boxplot
# Box plot of total_bill per day, split by sex.
tips.info()
sn.boxplot(data=tips, x='day', y='total_bill', hue='sex')
plt.show()
통계적 모델 관련 시각화
시계열분석, 상관분석, 회귀분석, 분류분석
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd # object
dataset load
# Load the three example datasets used in this section, then print the
# column summary of each.
iris = sn.load_dataset(name='iris')
tips = sn.load_dataset(name='tips')
flights = sn.load_dataset(name='flights')

iris.info()
tips.info()
flights.info()
1. 오차대역폭을 갖는 시계열 시각화
x : 시간축, y : 통계량
# Line plot of passengers by year.
sn.lineplot(data=flights, x='year', y='passengers')
plt.show()
hue 추가 : 월 단위 탑승객 추세
# One passengers-by-year line per month, then dump the raw table.
sn.lineplot(data=flights, x='year', y='passengers', hue='month')
plt.show()
print(flights)
2. 상관분석 : 연속형 vs 연속형 vs 이산형
# tip against total_bill, with marker size taken from the 'size' column.
sn.relplot(data=tips, x='total_bill', y='tip', size='size')
plt.show()
상관계수 : 0.6757341092113641
# Correlation coefficient between total_bill and tip.
tips['total_bill'].corr(tips['tip'])
3. 선형회귀분석 : 산점도와 회귀선, 신뢰구간(ci=95)
# Scatter plot with a fitted regression line and a 95% confidence band.
sn.lmplot(data=iris, x='sepal_length', y='petal_length', ci=95)
plt.show()
hue 속성 : species
# Separate regression fit per species, each with a 95% confidence band.
sn.lmplot(data=iris, x='sepal_length', y='petal_length', hue='species', ci=95)
plt.show()
4. 분류분석 : 혼동행렬(confusion matrix)
1) y true vs y pred
# Actual and predicted class labels for five samples.
y_true = pd.Series(data=[1, 0, 1, 0, 0])
y_pred = pd.Series(data=[1, 0, 1, 0, 1])
2) 혼동행렬 : 교차분할표
# Cross-tabulate actual labels (rows) against predicted labels (columns).
mat = pd.crosstab(y_true, y_pred)
mat
col_0 0 1
row_0
0 2 1
1 0 2
# The cross-tabulation is an ordinary DataFrame.
type(mat) # pandas.core.frame.DataFrame
3) 분류정확도
# Accuracy = correctly classified samples (the diagonal of the 2x2 table)
# divided by the total number of samples.
score = (mat.iloc[0, 0] + mat.iloc[1, 1]) / len(y_true)
# The printed label was mis-encoded in the source; restored to the intended
# Korean text ("classification accuracy").
print('분류정확도 = ', score)  # prints: 분류정확도 =  0.8
4) heatmap
# Draw the confusion matrix as an annotated heat map.
sn.heatmap(mat, annot=True, fmt=".2f", linewidths=2)
annot=True : label 표시
# Label both axes and put the accuracy in the title, then render.
plt.xlabel("Predicted label")
plt.ylabel("Real label")
acc_score = f"Accuracy score : {score}"
plt.title(acc_score, size=15)
plt.show()