98. Final Project (3) Building a Naver News Crawler
LEE_BOMB
2022. 1. 6. 21:32
Characteristics of Naver news search results
Naver returns at most 4,000 results per search keyword, 10 per result page, so the number of pages to crawl can be set anywhere from 1 to 400 (see the short sketch below for the page-to-offset arithmetic).
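Each result page is addressed through a start query parameter that points at the first result on that page. A minimal sketch of the offset arithmetic used by the extract_url() function below (the page numbers are just examples):

# page n of the search results starts at result (n - 1) * 10 + 1
def page_to_start(page):
    return (page - 1) * 10 + 1

print(page_to_start(1))    # 1    -> results 1~10
print(page_to_start(10))   # 91   -> results 91~100
print(page_to_start(400))  # 3991 -> results 3991~4000 (the 4,000-result cap)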
0. Import packages
import urllib.request as req    # HTTP requests
from bs4 import BeautifulSoup   # HTML parsing
import re                       # regular expressions
import pandas as pd             # DataFrame / CSV output
1. URL extraction: write a function that collects the news links
def extract_url(pages): # number of result pages to crawl
    maxPages = (pages - 1) * 10 + 1  # largest 'start' offset; pages=10 -> start=91
    news_urls = []  # URLs collected from every page
    page = 1        # page counter
    for start in range(1, maxPages + 1, 10):  # 1, 11, 21, ... 91
        # 1) build the URL
        url = f"link address={start}"  # placeholder in the original post: the actual Naver news search URL (with the query keyword) goes here
        print('page =', page)
        page += 1
        # 2) request the URL -> HTML source
        res = req.urlopen(url)
        data = res.read()
        # 3) parse the HTML
        src = data.decode('utf-8')
        html = BeautifulSoup(src, 'html.parser')
        links = html.select('div.news_area > div.news_info > div.info_group > a.info')
        # 4) collect the URLs
        urls = [link.attrs['href'] for link in links]
        # 5) filter: keep only Naver news URLs
        url_pattern = re.compile('^https://news.naver.com')
        page_urls = [url for url in urls if url_pattern.match(url)]
        news_urls.extend(page_urls)  # save the URLs from this page
        print('extracted url count =', len(page_urls))  # number of URLs crawled from this page
    return news_urls
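If the plain req.urlopen() call in step 2) ever gets blocked or returns an error page, the same request can be made with a browser-like User-Agent header. This is an optional sketch, not part of the original post; the header value is only an example:

import urllib.request as req

url = 'https://news.naver.com'  # any target URL
request = req.Request(url, headers={'User-Agent': 'Mozilla/5.0'})  # example header value
res = req.urlopen(request)
data = res.read()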
2. Extract & check the URLs
news_urls = extract_url(400)  # crawl the maximum 400 pages (any value from 1 to 400 works)
print(news_urls)  # check the Naver news URLs
len(news_urls)    # total number of URLs
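Search results can occasionally repeat the same article on different pages. Dropping duplicate URLs while keeping their original order is an optional extra step (not in the original post):

news_urls = list(dict.fromkeys(news_urls))  # remove duplicates, keep first-seen order
len(news_urls)  # URL count after de-duplication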
3. Crawler function: collect the title and body text from each news URL
def crawler_func(news_urls): # crawl each collected URL
    titles = []  # article titles
    conts = []   # article bodies
    for url in news_urls:  # URLs gathered per page
        # 1) request the URL -> HTML source
        res = req.urlopen(url)
        data = res.read()
        # print(data)  # <meta charset="euc-kr">
        # 2) parse the HTML
        try:  # handles decode failures and missing tags
            src = data.decode('euc-kr')
            html = BeautifulSoup(src, 'html.parser')
            # select_one: targets a single tag
            title = html.select_one('div.article_info > h3#articleTitle')
            article = html.select_one('div[id="articleBody"] > div[id="articleBodyContents"]')
            # append only after both tags were found, so titles and conts stay the same length
            titles.append(str(title.text).strip())
            conts.append(str(article.text).strip())  # collect the tag's text
            # print(titles)
            # print(conts)
        except Exception as e:
            print('exception occurred:', e)
            print('url:', url)
    return titles, conts
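The try block assumes every article page is served as euc-kr; a page served as utf-8 raises UnicodeDecodeError and gets skipped. A small fallback decoder, a sketch that is not in the original post:

def decode_html(data):
    # try euc-kr first (old Naver news pages), then fall back to utf-8
    try:
        return data.decode('euc-kr')
    except UnicodeDecodeError:
        return data.decode('utf-8', errors='replace')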
4. Collect the titles and contents
titles, conts = crawler_func(news_urls)
len(titles)  # number of titles
len(conts)   # number of article bodies
print(conts[0])  # first article body; the boilerplate sentence at its start is removed below
Output (beginning of conts[0]):
// flash 오류를 우회하기 위한 함수 추가
function _flash_removeCallback() {}
(followed by three line breaks, then the article text)
# remove the boilerplate sentence that appears at the start of each article
remove_str = "// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback\(\) \{\}\n\n\n"
contents = [re.sub(remove_str, "", cont) for cont in conts]
contents[0]  # boilerplate sentence removed
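A quick check that the boilerplate really is gone from the cleaned text (this verification step is my addition, not part of the original post):

assert all('_flash_removeCallback' not in cont for cont in contents)
print(contents[0][:100])  # first 100 characters of the cleaned article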
5. Save to a CSV file
naver_news = pd.DataFrame({'titles': titles, 'contents': contents},
                          columns=['titles', 'contents'])
naver_news.info()
print(naver_news.head())
path = r'D:\ITWILL'
naver_news.to_csv(path + '/상품명_news.csv', index=None, encoding="utf-8-sig")  # '상품명' = the search keyword (product name) used in the file name
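To confirm the file was written correctly, it can be read back with the same encoding (a usage sketch assuming the same path and file name):

check = pd.read_csv(path + '/상품명_news.csv', encoding='utf-8-sig')
check.info()
print(check.head())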