개인공부/Python

96. 파이널 프로젝트 (2)왓챠피디아 크롤러 만들기

LEE_BOMB 2022. 1. 3. 23:27

0. 패키지 임포트

import os
import pandas as pd
from selenium import webdriver #module 
from selenium.webdriver.common.keys import Keys #enter key
from selenium.webdriver.common.action_chains import ActionChains
import time #pause execution (wait for the screen)




1. 검색할 영화명 입력하기

# Ask for the movie title to search for. The prompt literal was mis-encoded
# (mojibake) and is restored to the intended Korean text ("movie title :").
# Surrounding whitespace is stripped so it is not typed into the search box.
query_txt = input('영화명 :').strip()



2. driver 객체 생성

# Folder that contains chromedriver.exe. The value is a placeholder
# ("path name", restored from a mis-encoded literal) - replace it with a real path.
path = r"경로명"
# os.path.join copes with a trailing separator in `path`, unlike string
# concatenation.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; Selenium 4 expects a Service object - confirm the installed version.
driver = webdriver.Chrome(os.path.join(path, 'chromedriver.exe'))



3. 왓챠피디아 url 이동

# Load the Watcha Pedia landing page (ko-KR locale URL) in the browser.
driver.get('https://pedia.watcha.com/ko-KR')

 

 

4. 영화명 검색 > 엔터키

# Wait briefly before looking up the header search box.
time.sleep(1)
# The find_element_by_* helpers were removed in Selenium 4.3. The generic
# find_element(by, value) call exists in Selenium 3 and 4; 'xpath' and 'name'
# are the string values of By.XPATH and By.NAME.
textbox = driver.find_element('xpath', '//*[@id="root"]/div/div[1]/header/nav/div/div/ul/li[5]/div/div/form/label/input')
textbox.click()

element = driver.find_element("name", "searchKeyword")
# The trailing newline submits the search in the same send_keys call.
element.send_keys(query_txt + "\n")

 

에러 발생

# Earlier attempt that produced the error quoted below
# ("'NoneType' object has no attribute 'send_keys'"); kept for reference.
# The mis-encoded movie-title literal is restored to the intended Korean text.
driver.find_element_by_name('searchKeyword').send_keys("신과함께")
textbox.send_keys(Keys.ENTER)

[에러] 'NoneType' object has no attribute 'send_keys'
[해결] https://stackoverflow.com/questions/58355552/how-to-combat-the-attributeerror-nonetype-object-has-no-attribute-send-keys

 


5. 리뷰페이지 접속
특정 위치 클릭

# Wait for the search results, then click a fixed element on the results page.
time.sleep(2)
# find_element_by_xpath was removed in Selenium 4.3; find_element('xpath', ...)
# works on Selenium 3 and 4 ('xpath' is the value of By.XPATH).
blank_click = driver.find_element('xpath', '/html/body/div/div/div[1]/section/section/div[1]/div')
blank_click.click()


영화 클릭

# Click the poster image of the first entry (li[1]) in the result list.
# find_element_by_xpath was removed in Selenium 4.3; find_element('xpath', ...)
# works on Selenium 3 and 4 ('xpath' is the value of By.XPATH).
movie_click = driver.find_element('xpath', '/html/body/div/div/div[1]/section/section/div[3]/div[1]/section/section[1]/div/div[1]/div/ul/li[1]/a/div[1]/div[1]/img')
movie_click.click()


스크롤 내리기

# Wait two seconds, then scroll the window to a fixed vertical offset of 1000.
time.sleep(2)
driver.execute_script("window.scrollTo(0, 1000)")
#Limitation found -> the height is relative, not absolute; the number needs adjusting to the monitor resolution


더보기 클릭

# Click the "more" link in the header of the fifth section of the detail page.
# find_element_by_xpath was removed in Selenium 4.3; find_element('xpath', ...)
# works on Selenium 3 and 4 ('xpath' is the value of By.XPATH).
more_view = driver.find_element('xpath', '/html/body/div[1]/div/div[1]/section/div/div[2]/div/div/div/div[1]/div[1]/div/div/section[5]/div[1]/div/header/div/div/a')
more_view.click()


갱신되는 페이지 스크롤 끝까지 내리기

# Scroll to the bottom repeatedly until the document height stops changing.
last_height = driver.execute_script("return document.body.scrollHeight")

page_still_growing = True
while page_still_growing:
    # Jump to the current bottom of the document.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait before measuring again.
    time.sleep(1)

    # Re-read the height; an unchanged value ends the loop.
    new_height = driver.execute_script("return document.body.scrollHeight")
    page_still_growing = new_height != last_height
    last_height = new_height

time.sleep(5)



6. 데이터 수집
데이터프레임 생성

# Empty frame that fixes the column layout of the collected reviews.
review_columns = ['movie_title', 'writer_name', 'star_grade']
data = pd.DataFrame(data=[], columns=review_columns)


데이터 수집 함수 생성

def get_movie_reviews(driver, data, k, movie_title=None):
    """Collect up to ``k`` reviewer-name / star-rating pairs from the current page.

    Args:
        driver: Selenium WebDriver (or any object exposing
            ``find_elements(by, value)``) whose current page holds the reviews.
        data: DataFrame with three columns (movie title, writer name, star
            grade, in that order). It is not modified.
        k: Maximum number of reviews to collect.
        movie_title: Title stored in every collected row. Defaults to the
            module-level ``query_txt`` (the search term).

    Returns:
        A new DataFrame: the rows of ``data`` followed by one row per
        collected review. Fewer than ``k`` rows are added when the page
        exposes fewer name or rating elements.
    """
    if movie_title is None:
        movie_title = query_txt

    # "css selector" is the value of selenium's By.CSS_SELECTOR. The generic
    # find_elements(by, value) call exists in Selenium 3 and 4, whereas the
    # find_elements_by_css_selector helper was removed in Selenium 4.3.
    writer_name = driver.find_elements("css selector", ".css-1agoci2")
    star_grade = driver.find_elements("css selector", ".css-yqs4xl")

    # zip() stops at the shorter list, so a missing name or rating element no
    # longer raises IndexError; the slices cap the result at k reviews.
    limit = max(k, 0)
    rows = [
        [movie_title, writer.text, grade.text]
        for writer, grade in zip(writer_name[:limit], star_grade[:limit])
    ]

    # Build the new rows once instead of concatenating row by row.
    if rows:
        collected = pd.DataFrame(data=rows, columns=data.columns)
        data = pd.concat([data, collected], ignore_index=True)

    # Status message restored from a mis-encoded literal
    # ("<title> review collection complete").
    print(movie_title + " 리뷰 수집 완료")

    return data

 

[์—๋Ÿฌ] ๋ฆฌ๋ทฐ์–ด๋ช…๊ณผ ๋ณ„์ ์ด html์ฝ”๋“œ๊ฐ€ ๊ทธ๋Œ€๋กœ ๊ธํ˜€ ๋‚˜์˜จ๋‹ค.

[ํ•ด๊ฒฐ] css_selector ๊ด€๋ จ ๊ฒŒ์‹œ๋ฌผ๋“ค ์ฝ๊ณ , for๋ฌธ๊ณผ selector ์ˆ˜์ •ํ•ด์„œ ํ•ด๊ฒฐ

 


데이터 수집 함수 실행 (리뷰 20개 수집)

# Collect reviews from the page currently loaded in `driver`, starting from the
# empty `data` frame. NOTE(review): the accompanying text says 20 reviews are
# collected, but k=21 is passed - confirm which count is intended.
review = get_movie_reviews(driver, data, 21)


star_grade์˜ '๋ณด๊ณ ์‹ถ์–ด์š”'๋Š” ์‚ญ์ œ

# Drop rows whose rating cell contains the "보고싶어요" (want-to-watch) label
# instead of a star grade. The literal is restored from mojibake, which could
# never match the page text. .copy() makes the result an independent frame so
# later column assignments do not trigger SettingWithCopyWarning.
review = review[~review['star_grade'].str.contains("보고싶어요", na=False, case=False)].copy()


평점 평균 얻어서 새로운 열에 추가 (소수점 두번째 자리에서 반올림)

# Convert the scraped rating text to numbers. errors='coerce' turns anything
# non-numeric into NaN instead of raising; mean() skips NaN.
review['star_grade'] = pd.to_numeric(review['star_grade'], errors='coerce', downcast='float')

# Add a summary row holding the mean rating. DataFrame.append was removed in
# pandas 2.0, so the row is built as a one-row frame and concatenated. The
# writer_name label '평균' ("mean") is restored from a mis-encoded literal.
mean_row = pd.DataFrame([{'movie_title' : query_txt, 'writer_name' : '평균', 'star_grade' : review['star_grade'].mean()}])
review = pd.concat([review, mean_row], ignore_index=True)
review = review.round(2)
print(review)


데이터 누적 저장

# Append the collected reviews to the CSV file. The path is a placeholder
# ("path\filename.csv", restored from a mis-encoded literal). It is now a raw
# string so the backslash is not parsed as an (invalid) escape sequence.
review.to_csv(r'경로\파일명.csv', sep=',', na_rep='NaN', encoding='utf-8-sig', mode='a', header=False, index=True)
#mode='a' : append (cumulative save), sep : field separator, na_rep : text written for missing values, encoding : prevents garbled characters, header=False : no column header row
# Status message restored from a mis-encoded literal ("<title> review save complete").
print(query_txt + " 리뷰 저장 완료")

 

 

 

 

 

크롤러 실행 영상

 

 

 

 

์•„์‰ฌ์šด์  & ๋” ์ˆ˜์ •ํ•ด์•ผํ• ์ 

1. ์˜ํ™”๋ช…์ด ์™„๋ฒฝํ•˜๊ฒŒ ์ผ์น˜ํ•˜์ง€ ์•Š์œผ๋ฉด ์ฒซ ๋ฒˆ์งธ๋กœ ๋…ธ์ถœ๋˜์ง€ ์•Š์œผ๋ฏ€๋กœ, ๋‹ค๋ฅธ ์˜ํ™”๊ฐ€ ํด๋ฆญ๋  ์œ„ํ—˜์„ฑ์ด ์žˆ์Œ (ex. ๋งˆ์Šคํ„ฐ, ๋ฒ”์ฃ„๋„์‹œ2, ๋” ํ‚น)
2. ๋ณ„์ ์ด ์•„๋‹Œ '๋ณด๊ณ ์‹ถ์–ด์š”'๊ฐ€ ์กด์žฌ. ๊ฒฐ์ธก์น˜๋กœ ๋ฐ”๊พธ์–ด์ฃผ๋Š” ์ž‘์—…์ด ํ•„์š”ํ–ˆ์Œ -> ๊ตฌํ˜„ ์™„๋ฃŒ
3. ๋น„๊ณต๊ฐœ, ์‚ญ์ œ ๋“ฑ์˜ ์ด์œ ๋กœ ๋ณ„์ ์ด ํ‘œ์‹œ๋˜์ง€ ์•Š๋Š” ๋ฆฌ๋ทฐ๊ฐ€ ์ƒ์„ฑ๋˜๋ฉด ์ˆ˜๋™์œผ๋กœ ์ˆ˜์ง‘ ๋ฆฌ๋ทฐ ๊ฐœ์ˆ˜๋ฅผ ์ˆ˜์ •ํ•ด์ฃผ์–ด์•ผ ํ–ˆ์Œ
4. input์— ์˜ํ™” ์ด๋ฆ„์„ ํ•˜๋‚˜ํ•˜๋‚˜ ์ž…๋ ฅ๋ฐ›์•„์•ผ ํ•˜๋Š” ๊ฒŒ ๋ฒˆ๊ฑฐ๋กœ์›€