그래프 시각화

1. 제작비별

0. 모듈 임포트

import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns



1. 데이터 가져오기

# Load the movie dataset. The path is a placeholder ("path\filename.csv");
# point it at the real CSV before running.
movie = pd.read_csv(r'경로\파일명.csv')
movie  # notebook-style preview of the loaded frame



2. 결측치 처리

movie.isna().sum()  # count the missing values in each column
movie = movie.dropna(axis=0)  # drop every row that contains a missing value
movie

movie["추정 제작비"]
# Summary statistics of the estimated production cost column.
movie["추정 제작비"].describe(include='all')

성공
count     49.000000
mean     124.285714
std       71.219730
min       35.000000
25%       75.000000
50%      105.000000
75%      150.000000
max      430.000000
Name: 추정 제작비, dtype: float64

망함
count     47.000000
mean      71.234043
std       39.120890
min        5.000000
25%       46.000000
50%       60.000000
75%       84.000000
max      230.000000
Name: 추정 제작비, dtype: float64

# Split the estimated production cost into three bands: <=100, (100, 200], >200.
# The trailing "a / b" figures are row counts the author recorded; they add up to
# 49 and 47, the counts of the two describe() outputs shown above.
cost_100 = movie.loc[movie["추정 제작비"] <= 100, ["추정 제작비"]]  # 24 / 41
cost_100_200 = movie.loc[(movie["추정 제작비"] > 100) & (movie["추정 제작비"] <= 200), ["추정 제작비"]]  # 19 / 5
cost_200 = movie.loc[(movie["추정 제작비"] > 200), ["추정 제작비"]]  # 6 / 1



3. ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— tag์นผ๋Ÿผ์„ ์ถ”๊ฐ€ํ•ด ์ œ์ž‘๋น„ ๊ธฐ์ค€์œผ๋กœ ๊ฐ’์„ ์ž…๋ ฅ

movie["tag"] = 0 # initialise the new "tag" column to 0
movie["tag"]

def find_group(value):
    """Return the production-cost band label for ``value``.

    The bands are inclusive at their upper edge:

    * ``value <= 100``        -> "100 이하"  (100 or less)
    * ``100 < value <= 200``  -> "100 초과 200 이하"  (over 100, up to 200)
    * anything else           -> "200 초과"  (over 200)

    A NaN falls into the last band because both comparisons are false.
    """
    if value <= 100:
        return "100 이하"
    if value <= 200:
        return "100 초과 200 이하"
    return "200 초과"
        
# Label every movie with the band its estimated production cost falls into.
movie["tag"] = movie["추정 제작비"].map(find_group)
movie["tag"]



4. 막대 그래프 시각화

# Use a Hangul-capable font so the Korean labels render; the same font is
# passed to sns.set() together with the plot style.
plt.rc("font", family="Malgun Gothic")
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus": False}, style="white")  # styles: darkgrid, whitegrid, dark, white, ticks
ax = sns.countplot(x="tag", data=movie,
                   order=movie["tag"].value_counts().index,  # most frequent band first
                   palette="Set3")


plt.xticks(rotation=270)

plt.ylabel("영화 수")
plt.xlabel("제작비 (단위:억)")


sns.despine()  # remove the top and right spines

sns.despine(left=True, bottom=True)  # remove every spine


값 표시

# Write each bar's count, horizontally centred, at the top of the bar.
for p in ax.patches:
    height = p.get_height()
    # get_height() is a float; format it without decimals so a count reads
    # "24" rather than "24.0".
    ax.text(p.get_x() + p.get_width() / 2., height, format(height, ".0f"),
            ha='center', size=9)

plt.show()

print(movie["tag"].value_counts())



5. 산점도 시각화

# Scatter of estimated production cost per movie title, coloured by whether the
# movie passed ten million admissions.
plt.rc("font", family="Malgun Gothic")
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus": False}, style="whitegrid")  # styles: darkgrid, whitegrid, dark, white, ticks
sns.scatterplot(y="추정 제작비", x="영화 제목", data=movie, hue="천만관객돌파여부",
                palette="Set2")
sns.despine()  # remove the top and right spines
plt.ylim([0, 440])
plt.ylabel("제작비 (단위:억)")
plt.xticks(rotation=270)
plt.show()

 

 

 

 

 

2. 배급사별


1. 데이터 전처리

# Build a flat list with one entry per distributor credit. A cell may name
# several distributors separated by whitespace; "CJ ENM" contains a space
# itself, so a cell that is exactly "CJ ENM" is kept whole instead of split.
distributor = movie['배급사']
col_list = []

for d in distributor:
    if d == "CJ ENM":
        col_list.append(d)
    else:
        col_list.extend(d.split())  # split on whitespace

col_list


리스트 -> 데이터프레임

col_name = ["배급사"]
col_df = pd.DataFrame(col_list, columns=col_name)

######### Adjust the minimum count in this section ############
##### To plot every distributor, comment this whole section out #####
chart_data = col_df["배급사"].value_counts()
index = chart_data.index  # distributor names
values = chart_data.values  # number of occurrences of each name

# Distributors that occur more often than the threshold.
choice_genre = [idx for idx, val in zip(index, values) if val > 1]  # edit this number

# Filter with a Series mask so that non-matching rows are actually dropped;
# masking with the whole frame keeps them as all-NaN rows.
col_df = col_df[col_df["배급사"].isin(choice_genre)]
#############################################



2. 시각화

# Bar chart of how many movies each distributor appears on.
plt.rc("font", family="Malgun Gothic")
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus": False}, style="white")  # styles: darkgrid, whitegrid, dark, white, ticks
ax = sns.countplot(x="배급사", data=col_df,
                   order=col_df["배급사"].value_counts().index,  # most frequent first
                   palette="Set3")
plt.xticks(rotation=270)
plt.ylabel("영화 수")
sns.despine(left=True, bottom=True)  # remove every spine


값 표시

# Write each bar's count, horizontally centred, at the top of the bar.
for p in ax.patches:
    height = p.get_height()
    # get_height() is a float; format it without decimals so a count reads
    # "12" rather than "12.0".
    ax.text(p.get_x() + p.get_width() / 2., height, format(height, ".0f"),
            ha='center', size=9)

plt.show()

print(col_df["배급사"].value_counts())

 

 

 

 

 

3. 장르별

# Register a Hangul-capable font with matplotlib. The imports are placed here
# because this section uses names that the import lines at the top of the
# file do not bring into scope.
import platform
import warnings

import matplotlib
from matplotlib import font_manager, rc

if platform.system() == 'Windows':
    # Raw string: the backslashes are path separators, not escape sequences.
    path = r"C:\Windows\Fonts\malgun.ttf"
    try:
        font_name = font_manager.FontProperties(fname=path).get_name()
    except (OSError, RuntimeError) as err:
        # Best effort: keep matplotlib's current font, but report why.
        warnings.warn("Could not load font %s: %s" % (path, err))
    else:
        rc('font', family=font_name)
# On macOS the author's suggestion is: rc('font', family='AppleGothic')
matplotlib.rcParams['axes.unicode_minus'] = False


plt.figure(figsize=(20, 5))
# sns.set() applies seaborn's own font settings, so hand it the font family
# that is currently configured; otherwise the Hangul-capable font is lost.
sns.set(style='white', font=plt.rcParams['font.family'])  # styles: darkgrid, whitegrid, dark, white, ticks
sns.countplot(x='장르', data=movie, palette="Set2")
sns.despine(left=True, bottom=True)  # remove every spine
plt.show()

 

 

 

 

 

변수 추가 수집

0. 모듈 임포트

import pandas as pd

 

 

1. 경로 지정

path = r'경로명'  # placeholder ("path name"): directory that holds the CSV

# Load the actor/director variables file and show its column summary.
data = pd.read_csv(path + '/배우감독변수.csv')
data.info()



2. 서브셋 만들기

# Keep only the columns needed below: movie title, cumulative admissions,
# director(s) and lead actor(s).
df = data[['영화 제목', '누적관람객 수', '감독', '주연배우']]
df

title = df['영화 제목']
size = df['누적관람객 수']



4. 공백 기준으로 문자열 자르기

# Split each cell on whitespace, giving a list of names per movie.
col_list1 = df['감독'].str.split()
col_list1

col_list2 = df['주연배우'].str.split()
col_list2



5. dataFrame 묶기

# Put the four per-movie Series side by side under English column names.
new_df = pd.concat(
    [
        title.rename('title'),
        size.rename('size'),
        col_list1.rename('director'),
        col_list2.rename('actor'),
    ],
    axis=1,
)

new_df.head()



6. 감독과 배우 기준으로 새로운 데이터프레임 생성

# Accumulators for the one-row-per-person tables built in the next steps.
# Per director: movie title, cumulative admissions, director name.
new_title, new_size, new_director = [], [], []
# Per lead actor: movie title, cumulative admissions, actor name.
new_title2, new_size2, new_actor = [], [], []

 


7-1. 감독 기준

# Emit one (title, size, director) row for every director credited on a movie.
# zip() pairs the values row by row, so the result does not depend on the
# index labels being exactly 0..n-1.
for movie_title, audience, directors in zip(new_df['title'], new_df['size'],
                                            new_df['director']):
    if not isinstance(directors, list):
        continue  # missing director cell (NaN): nothing to expand
    for d in directors:
        new_title.append(movie_title)  # movie title
        new_size.append(audience)  # cumulative admissions
        new_director.append(d)  # director name


director_df = pd.DataFrame({'title': new_title, 'size': new_size,
                            'director': new_director},
                           columns=['title', 'size', 'director'])

director_df.info()  # the author's run reported RangeIndex: 101 entries, 0 to 100
print(director_df)

 

7-2. 주연배우 기준

# Emit one (title, size, actor) row for every lead actor credited on a movie.
# zip() pairs the values row by row, so the result does not depend on the
# index labels being exactly 0..n-1.
for movie_title, audience, actors in zip(new_df['title'], new_df['size'],
                                         new_df['actor']):
    if not isinstance(actors, list):
        continue  # missing actor cell (NaN): nothing to expand
    for a in actors:
        new_title2.append(movie_title)  # movie title
        new_size2.append(audience)  # cumulative admissions
        new_actor.append(a)  # actor name

actor_df = pd.DataFrame({'title': new_title2, 'size': new_size2,
                         'actor': new_actor},
                        columns=['title', 'size', 'actor'])

actor_df.info()  # the author's run reported RangeIndex: 393 entries, 0 to 392
print(actor_df)




8. 감독 기준 상위 누적관객수 평균 상위 50위

# Mean cumulative admissions per director, in order of first appearance
# (sort=False), computed in one pass instead of filtering the frame per name.
avg_by_director = director_df.groupby('director', sort=False)['size'].mean()

director_name = avg_by_director.index.to_numpy()
names = list(director_name)
size_avg = list(avg_by_director)

df2 = pd.DataFrame({'director': names, 'size_avg': size_avg})
df2

# Rank the directors by their average, highest first.
sorted_value = df2['size_avg'].sort_values(ascending=False)
idx = sorted_value.index

result2 = df2.loc[idx]  # idx holds index labels, so select with .loc
result2


result2[:50]  # top 50




9. 주연배우 기준 상위 누적관객수 평균 상위 50위

# Mean cumulative admissions per lead actor, in order of first appearance
# (sort=False), computed in one pass instead of filtering the frame per name.
avg_by_actor = actor_df.groupby('actor', sort=False)['size'].mean()

actor_name = avg_by_actor.index.to_numpy()
names = list(actor_name)
size_avg = list(avg_by_actor)

df = pd.DataFrame({'actor': names, 'size_avg': size_avg})
df

# Rank the actors by their average, highest first.
sorted_value = df['size_avg'].sort_values(ascending=False)
idx = sorted_value.index

result = df.loc[idx]  # idx holds index labels, so select with .loc
result

# NOTE(review): this shows 60 rows although the section title says top 50 —
# confirm which number is intended.
result[:60]

 

 

 

10. 엑셀파일 저장

# Save the ranked actor table. The path is a placeholder ("path name\file name").
# NOTE(review): mode='a' with header=False appends unlabeled rows, so running
# this twice duplicates the data — confirm that appending is intended.
result.to_csv(r'경로명\파일명.csv', sep=',', na_rep='NaN', encoding='utf-8-sig', mode='a', header=False, index=True)





+ Recent posts