그래프 시각화
1. 제작비별
0. 모듈 임포트
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
1. 데이터 가져오기
# Load the movie dataset into a DataFrame.
# The path literal had been split across two lines by a stray line break,
# which left the string unterminated; it is re-joined here.
# NOTE(review): the path is a placeholder and its text looks mis-encoded --
# substitute the real CSV location before running.
movie = pd.read_csv(r'๊ฒฝ๋ก\ํ์ผ๋ช.csv')
movie
2. 결측치 처리
# Count the missing values in every column.
movie.isnull().sum()

# Drop every row that contains at least one missing value.
movie = movie.dropna(axis="index")
movie

# Inspect the budget column and its summary statistics.
# NOTE(review): the column name looks mis-encoded -- confirm against the CSV header.
movie["์ถ์ ์ ์๋น"]
movie["์ถ์ ์ ์๋น"].describe(include='all')
성공
count 49.000000
mean 124.285714
std 71.219730
min 35.000000
25% 75.000000
50% 105.000000
75% 150.000000
max 430.000000
Name: ์ถ์ ์ ์๋น, dtype: float64
망함
count 47.000000
mean 71.234043
std 39.120890
min 5.000000
25% 46.000000
50% 60.000000
75% 84.000000
max 230.000000
Name: ์ถ์ ์ ์๋น, dtype: float64
# Split the budget column into three bands: <= 100, (100, 200], > 200.
# The trailing "a / b" figures are the author's recorded row counts
# (presumably for the two groups summarised above -- confirm).
is_low = movie["์ถ์ ์ ์๋น"] <= 100
is_mid = (movie["์ถ์ ์ ์๋น"] > 100) & (movie["์ถ์ ์ ์๋น"] <= 200)
is_high = movie["์ถ์ ์ ์๋น"] > 200

cost_100 = movie.loc[is_low, ["์ถ์ ์ ์๋น"]]  # 24 / 41
cost_100_200 = movie.loc[is_mid, ["์ถ์ ์ ์๋น"]]  # 19 / 5
cost_200 = movie.loc[is_high, ["์ถ์ ์ ์๋น"]]  # 6 / 1
3. ๋ฐ์ดํฐํ๋ ์์ tag์นผ๋ผ์ ์ถ๊ฐํด ์ ์๋น ๊ธฐ์ค์ผ๋ก ๊ฐ์ ์
๋ ฅ
movie["tag"] = 0 # create the "tag" column, initialised to 0 for every row
movie["tag"]
def find_group(value):
    """Return the budget-band label for a single budget figure.

    Args:
        value: A number compared against the 100 and 200 thresholds.

    Returns:
        One of three label strings: the first for ``value <= 100``, the
        second for ``100 < value <= 200``, the third for everything else.
    """
    # NOTE(review): the label text looks mis-encoded (presumably Korean);
    # it is kept exactly as found -- confirm against the original source.
    if value <= 100:
        return "100 ์ดํ"
    if value <= 200:
        return "100 ์ด๊ณผ 200 ์ดํ"
    return "200 ์ด๊ณผ"
# Label every row with its budget band by applying find_group element-wise.
movie["tag"] = movie["์ถ์ ์ ์๋น"].map(find_group)
movie["tag"]
4.막대 그래프 시각화
# Select the Malgun Gothic font for both matplotlib and seaborn, and turn off
# the Unicode minus sign on the axes.
plt.rc("font", family="Malgun Gothic")
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus": False},
        style="white")  # other styles: darkgrid, whitegrid, dark, ticks

# Count of movies per budget tag, bars ordered by descending frequency.
ax = sns.countplot(x="tag", data=movie,
                   order=movie["tag"].value_counts().index,
                   palette="Set3")
plt.xticks(rotation=270)
# NOTE(review): the axis label text looks mis-encoded; kept exactly as found.
plt.ylabel("์ํ ์")
plt.xlabel("์ ์๋น (๋จ์:์ต)")

# The trailing comments here had been split onto their own lines, leaving bare
# text that is not valid Python; they are restored as comments.
sns.despine()  # remove the top and right spines
sns.despine(left=True, bottom=True)  # remove all four spines
값 표시
# Write each bar's height as text at the top of the bar, centred horizontally.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height, height,
            ha='center', size=9)
plt.show()

# Print the same counts as a table.
print(movie["tag"].value_counts())
5. 산점도 시각화
# Select the Malgun Gothic font and a white-grid theme for the scatter plot.
plt.rc("font", family="Malgun Gothic")
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus": False},
        style="whitegrid")  # other styles: darkgrid, dark, white, ticks

# Budget (y) per movie title (x), coloured by the third column.
# NOTE(review): the column names look mis-encoded -- confirm against the CSV header.
sns.scatterplot(y="์ถ์ ์ ์๋น", x="์ํ ์ ๋ชฉ", data=movie,
                hue="์ฒ๋ง๊ด๊ฐ๋ํ์ฌ๋ถ",
                palette="Set2")
# The trailing comment had been split onto its own line; restored as a comment.
sns.despine()  # remove the top and right spines
plt.ylim([0, 440])
plt.ylabel("์ ์๋น (๋จ์:์ต)")
plt.xticks(rotation=270)
plt.show()
2. 배급사별
1. 데이터 전처리
# Flatten the distributor column into one list of distributor names.
# A cell may hold several names separated by whitespace, but "CJ ENM" is a
# single name containing a space and must not be split.
# NOTE(review): the column name looks mis-encoded -- confirm against the CSV header.
distributor = movie['๋ฐฐ๊ธ์ฌ']
col_list = []
for d in distributor:
    # Keep "CJ ENM" whole; split every other cell on whitespace.
    col_list.extend([d] if d == "CJ ENM" else d.split())
col_list
리스트 -> 데이터프레임
# Turn the flat list of distributor names into a one-column DataFrame.
col_name = ["๋ฐฐ๊ธ์ฌ"]
col_df = pd.DataFrame(col_list, columns=col_name)

######### Edit this section to change the minimum frequency ############
##### Comment the whole section out to keep every distributor #########
chart_data = col_df["๋ฐฐ๊ธ์ฌ"].value_counts()
index = chart_data.index    # distinct distributor names
values = chart_data.values  # how often each name occurs

# Names occurring more often than the threshold; change the number to adjust.
choice_genre = [idx for idx, val in zip(index, values) if val > 1]

# Filter rows with a Series mask. The previous DataFrame-level mask
# (col_df[col_df.isin(...)]) kept every row and only blanked the
# non-matching cells to NaN instead of removing them.
col_df = col_df[col_df["๋ฐฐ๊ธ์ฌ"].isin(choice_genre)]
#############################################
2. 시각화
# Select the Malgun Gothic font and a plain white theme.
plt.rc("font", family="Malgun Gothic")
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus": False},
        style="white")  # other styles: darkgrid, whitegrid, dark, ticks

# Count of movies per distributor, bars ordered by descending frequency.
ax = sns.countplot(x="๋ฐฐ๊ธ์ฌ", data=col_df,
                   order=col_df["๋ฐฐ๊ธ์ฌ"].value_counts().index,
                   palette="Set3")
plt.xticks(rotation=270)
# NOTE(review): the axis label text looks mis-encoded; kept exactly as found.
plt.ylabel("์ํ ์")

# The comments here had been split onto their own lines, leaving bare text that
# is not valid Python; they are restored as comments.
# Alternative: sns.despine() removes only the top and right spines.
sns.despine(left=True, bottom=True)  # remove all four spines
값 표시
# Write each bar's height as text at the top of the bar, centred horizontally.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height, height,
            ha='center', size=9)
plt.show()

# Print the same counts as a table.
print(col_df["๋ฐฐ๊ธ์ฌ"].value_counts())
3. 장르별
# These names were used below without being imported anywhere visible; the
# bare "except: pass" hid the resulting NameError, so the font was never set.
import platform
import warnings

import matplotlib
from matplotlib import font_manager, rc

# Apply the seaborn theme first. sns.set() also sets font.family, so calling
# it after the font setup (as before) discarded the font chosen below.
sns.set(style='white')  # other styles: darkgrid, whitegrid, dark, ticks

if platform.system() == 'Windows':
    # Raw string: "\W", "\F" and "\m" are invalid escape sequences otherwise.
    path = r"C:\Windows\Fonts\malgun.ttf"
    try:
        font_name = font_manager.FontProperties(fname=path).get_name()
    except (OSError, RuntimeError) as err:
        # Still best-effort (a missing or unreadable font must not stop the
        # plot), but the failure is now reported instead of swallowed.
        warnings.warn(f"could not load font {path!r}: {err}")
    else:
        rc('font', family=font_name)
# On macOS one would use instead: rc('font', family='AppleGothic')

matplotlib.rcParams['axes.unicode_minus'] = False

# Count of movies per genre on a wide figure.
# NOTE(review): the column name looks mis-encoded -- confirm against the CSV header.
plt.figure(figsize=(20, 5))
sns.countplot(x='์ฅ๋ฅด', data=movie, palette="Set2")
# Alternative: sns.despine() removes only the top and right spines.
sns.despine(left=True, bottom=True)  # remove all four spines
plt.show()
변수 추가 수집
0. 모듈 임포트
import pandas as pd
1. 경로 지정
# Directory holding the input file, and the CSV loaded from it.
# Both string literals had been split across lines by stray line breaks,
# leaving them unterminated; they are re-joined here.
# NOTE(review): the directory is a placeholder and the text looks
# mis-encoded -- substitute the real location before running.
path = r'๊ฒฝ๋ก๋ช'
data = pd.read_csv(path + '/๋ฐฐ์ฐ๊ฐ๋๋ณ์.csv')
data.info()
2. 서브셋 만들기
# Keep only the four columns used below. One column-name literal had been
# split across two lines by a stray line break; it is re-joined here.
# NOTE(review): the column names look mis-encoded -- confirm against the CSV header.
df = data[['์ํ ์ ๋ชฉ', '๋์ ๊ด๋๊ฐ ์', '๊ฐ๋', '์ฃผ์ฐ๋ฐฐ์ฐ']]
df

# Convenience handles on the first two selected columns.
title = df['์ํ ์ ๋ชฉ']
size = df['๋์ ๊ด๋๊ฐ ์']
4. 공백 기준으로 문자열 자르기
# Split the two remaining columns on whitespace, giving one list of names per
# row. The first column-name literal had been split across two lines by a
# stray line break; it is re-joined here.
# NOTE(review): the column names look mis-encoded -- confirm against the CSV header.
col_list1 = df['๊ฐ๋'].str.split()
col_list1
col_list2 = df['์ฃผ์ฐ๋ฐฐ์ฐ'].str.split()
col_list2
5. dataFrame 묶기
# Combine the title, the audience figure and the two split name lists into
# one frame with English column names, in a fixed column order.
_new_df_columns = {
    'title': title,
    'size': size,
    'director': col_list1,
    'actor': col_list2,
}
new_df = pd.DataFrame(
    _new_df_columns,
    columns=['title', 'size', 'director', 'actor'],
)
new_df.head()
6. 감독과 배우 기준으로 새로운 데이터프레임 생성
# Accumulators for the long-format frames built below (one row per person).
# Two trailing comments had been split onto their own lines, leaving bare text
# that is not valid Python; they are restored as comments.
new_title = []     # movie title, one entry per director
new_title2 = []    # movie title, one entry per actor
new_size = []      # audience figure, one entry per director
new_size2 = []     # audience figure, one entry per actor
new_director = []  # director names
new_actor = []     # actor names
7-1. 감독 기준
# Expand to one row per (movie, director) pair.
# Rows are walked in lockstep with zip; the previous title[i] / size[i] lookups
# used a positional counter as an index label, which is only correct while the
# frame keeps its default 0..n-1 index.
for movie_title, movie_size, directors in zip(title, size, new_df['director']):
    for d in directors:
        new_title.append(movie_title)  # movie title
        new_size.append(movie_size)    # audience figure for this director
        new_director.append(d)         # director name

director_df = pd.DataFrame({'title': new_title, 'size': new_size,
                            'director': new_director},
                           columns=['title', 'size', 'director'])
director_df.info()  # author's recorded result: RangeIndex: 101 entries, 0 to 100
print(director_df)
7-2. 주연배우 기준
# Expand to one row per (movie, actor) pair.
# Rows are walked in lockstep with zip; the previous title[i] / size[i] lookups
# used a positional counter as an index label, which is only correct while the
# frame keeps its default 0..n-1 index.
for movie_title, movie_size, actors in zip(title, size, new_df['actor']):
    for a in actors:
        new_title2.append(movie_title)  # movie title
        new_size2.append(movie_size)    # audience figure for this actor
        new_actor.append(a)             # actor name

actor_df = pd.DataFrame({'title': new_title2, 'size': new_size2,
                         'actor': new_actor},
                        columns=['title', 'size', 'actor'])
actor_df.info()  # author's recorded result: RangeIndex: 393 entries, 0 to 392
print(actor_df)
8. 감독 기준 상위 누적 관객수 평균 상위 50위
# Mean audience figure per director, ranked from highest to lowest.
director_name = director_df['director'].unique()
names = []
size_avg = []
for name in director_name:
    names.append(name)
    # Note: this rebinds the module-level name `df` to the per-director rows.
    df = director_df[director_df['director'] == name]
    size_avg.append(df['size'].mean())

df2 = pd.DataFrame({'director': names, 'size_avg': size_avg})
df2

# Order the rows by descending mean. `idx` holds index labels, so rows are
# selected with .loc (the previous .iloc treated those labels as positions).
# A leftover dir(df2) exploration call was dropped.
sorted_value = df2['size_avg'].sort_values(ascending=False)
idx = sorted_value.index
result2 = df2.loc[idx]
result2
result2[:50]  # top 50 directors
9. 주연배우 기준 상위 누적 관객수 평균 상위 50위
# Mean audience figure per actor, ranked from highest to lowest.
actor_name = actor_df['actor'].unique()
names = []
size_avg = []
for name in actor_name:
    names.append(name)
    # `df` is reused for the per-actor rows here and rebound again below.
    df = actor_df[actor_df['actor'] == name]
    size_avg.append(df['size'].mean())
names
size_avg

df = pd.DataFrame({'actor': names, 'size_avg': size_avg})
df

# Order the rows by descending mean. `idx` holds index labels, so rows are
# selected with .loc (the previous .iloc treated those labels as positions).
# Leftover dir(df) and help(...) exploration calls were dropped; help() can
# open an interactive pager and stall a non-interactive run.
sorted_value = df['size_avg'].sort_values(ascending=False)
idx = sorted_value.index
result = df.loc[idx]
result
# NOTE(review): the section heading says top 50 but this slice takes 60 -- confirm.
result[:60]
10. 엑셀 파일 저장
# Append the ranked actor table to a CSV file, with the index and no header row.
# The path literal had been split across three lines by stray line breaks,
# leaving the string unterminated; it is re-joined here.
# NOTE(review): the path is a placeholder and its text looks mis-encoded --
# substitute the real output location before running.
result.to_csv(r'๊ฒฝ๋ก๋ช\ํ์ผ๋ช.csv', sep=',', na_rep='NaN',
              encoding='utf-8-sig', mode='a', header=False, index=True)
'๊ฐ์ธ๊ณต๋ถ > Python' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
104. ํ์ด๋ ํ๋ก์ ํธ (10)SVM, Naive Bayes๋ชจ๋ธ ๋ง๋ค๊ธฐ (0) | 2022.01.13 |
---|---|
103. ํ์ด๋ ํ๋ก์ ํธ (9)์๊ด๋ถ์ (0) | 2022.01.12 |
101. ํ์ด๋ ํ๋ก์ ํธ (7)๋ค์ด๋ฒ ์ํ ๋ฆฌ๋ทฐ ์๋ํด๋ผ์ฐ๋ (0) | 2022.01.10 |
101. ํ์ด๋ ํ๋ก์ ํธ (6)๋ค์ด๋ฒ ์ํ ๋ฆฌ๋ทฐ ํฌ๋กค๋ฌ ๋ง๋ค๊ธฐ (0) | 2022.01.09 |
100. ํ์ด๋ ํ๋ก์ ํธ (5)html ๊ธฐ๋ณธ (0) | 2022.01.08 |