DAY57. Python Text Mining (2) Web Crawling (selectors, news crawling)
selector
A selector targets elements in a web document (originally used to style the document's design).
Types : id(#), class(.)
- id : no duplicates -> selects 1 tag (e.g. extracting an article headline)
- class : duplicates allowed -> selects n tags (e.g. extracting content made up of several paragraphs)
html.select_one('#idName')
html.select('.className')
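A minimal sketch of the two forms on a throwaway HTML string (the id/class names below are made up for illustration, not taken from the course files):
from bs4 import BeautifulSoup # html parsing
src = '<h1 id="headline">Breaking news</h1><p class="body">first</p><p class="body">second</p>'
html_demo = BeautifulSoup(src, 'html.parser')
print(html_demo.select_one('#headline').text) # id selector -> a single tag ("Breaking news")
for p in html_demo.select('.body') : # class selector -> a list of tags
    print(p.text) # first / second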
from bs4 import BeautifulSoup # html parsing
1. Load the html source
path = r'C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data\html03.html'
file = open(path, mode = 'r', encoding = 'UTF-8') # 'r' = read mode; encoding = match the charset declared in the html file
src = file.read()
file.close()
2. Parse the html
html = BeautifulSoup(src, 'html.parser')
print(html)
3. Get tag contents using selectors
1) id selector
table = html.select_one('#tab') #id='tab'
print(table)
2) Hierarchical selectors
ths = html.select('#tab > tr > th') # <table> > <tr> > <th> -> pull out the 4 <th> tags in the table at once
print(ths) #[<th id="id"> 학번 </th>, <th id="name"> 이름 </th>, <th id="major"> 학과 </th>, <th id="email"> 이메일 </th>]
for th in ths :
    print(th.text)
학번
이름
학과
이메일
3) class selector : '.className'
trs = html.select('.odd') # class="odd"
print(trs) # returns a list
4) tag[attribute='value']
trs2 = html.select("tr[class='odd']") # select only the <tr> tags whose class attribute is 'odd' (2 of the 5 rows)
print(trs2) # returns a list. Forms 3) and 4) return the same result; form 4) is used more often.
[<tr class="odd"> <!-- row 3 -->
<td> 201602 </td>
<td> 이순신 </td>
<td> 해양학과 </td>
<td> lee@naver.com </td>
</tr>, <tr class="odd"> <!-- row 5 -->
<td> 201604 </td>
<td> 유관순 </td>
<td> 유아교육 </td>
<td> you@naver.com </td>
</tr>]
Print the contents of the selected rows
for tr in trs2 :
    tds = tr.find_all('td') # list of <td> tags
    for td in tds :
        print(td.text) # print the cell contents
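For reference, class and attribute selectors can be mixed; a small sketch (same html parse as above) showing three spellings that pick out the same rows, assuming the class attribute is exactly "odd" as in html03.html:
trs_a = html.select('.odd') # bare class selector
trs_b = html.select('tr.odd') # tag + class
trs_c = html.select("tr[class='odd']") # tag[attribute='value']
print(len(trs_a) == len(trs_b) == len(trs_c)) # True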
newsCrawling
Crawl the news currently shown on the main page
url : http://media.daum.net
import urllib.request as req # url request
from bs4 import BeautifulSoup # html parsing
url = "http://media.daum.net"
1. Request the url
res = req.urlopen(url)
src = res.read() # read the source
print(src)
2. Decode & parse the html
data = src.decode('utf-8')
html = BeautifulSoup(data, 'html.parser')
print(html)
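Hard-coding 'utf-8' works for this page; a slightly more defensive sketch reads the charset from the response headers (get_content_charset() on the standard-library response message) and falls back to utf-8:
import urllib.request as req
from bs4 import BeautifulSoup
res = req.urlopen('http://media.daum.net')
charset = res.headers.get_content_charset() or 'utf-8' # charset declared by the server, else utf-8
data = res.read().decode(charset, errors='replace') # replace undecodable bytes instead of raising
html = BeautifulSoup(data, 'html.parser')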
3. Extract elements with tag[attribute=value]
links = html.select('a[class="link_txt"]') #e.g. <a href="https://v.daum.net/v/20211126102955289" class="link_txt" data-tiara-layer="article" data-tiara-id="20211126102955289" data-tiara-type="harmony" data-tiara-ordnum="4" data-tiara-custom="contentUniqueKey=hamny-20211126102955289"> (Korean headline) </a>
len(links) # 62
print(links)
Extract the text of the <a> tags
contents = [] # store the link text
cnt = 0
for link in links :
    tmp = str(link.text) # convert to string
    contents.append(tmp.strip()) # strip leading/trailing whitespace and control characters
    print(cnt, '->', tmp.strip())
    cnt += 1 # counter
4. Collect urls (to crawl the full news articles)
urls = [] # store the urls
for link in links[:46] :
    try :
        urls.append(link.attrs['href']) # extract the url from the <a> tag
    except Exception as e :
        print('exception occurred : ', e)
print(urls)
len(urls) #46
5. Filter the news-related urls
https://news.v.daum.net
import re
url_pat = re.compile('^https://news.v.daum.net') # create a pattern object
Keep only the urls that match the pattern
news_urls = [url for url in urls if url_pat.match(url)]
len(news_urls) #17
print(news_urls)
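As an alternative to the regex prefix match, the host of each url can be compared directly with urllib.parse; a sketch that should give the same result, assuming the article links all use https and the news.v.daum.net host:
from urllib.parse import urlparse
news_urls2 = [url for url in urls if urlparse(url).netloc == 'news.v.daum.net'] # keep Daum news hosts only
len(news_urls2) # expected to match len(news_urls) above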
6. Crawler function : collects the documents
def crawler_fn(url) :
    #1. request the url
    res = req.urlopen(url)
    src = res.read() # read the source
    #2. decode & parse the html
    data = src.decode('utf-8')
    html = BeautifulSoup(data, 'html.parser')
    #3. collect the title and the body
    #1) title : a single tag
    title = str(html.select_one('h3[class="tit_view"]').text).strip()
    #2) body : multiple tags
    '''
    div.news_view > div.article_view > section > p
    '''
    article = html.select('div[class="news_view"] > div[class="article_view"] > section > p')
    #4. join the paragraphs (p) into a single string
    conts = ""
    for p in article :
        text = str(p.text).strip()
        conts += text # accumulate the text
    return title, conts
Call the crawler function
titles = [] # store the titles
news = [] # store the contents
for url in news_urls :
    title, conts = crawler_fn(url) # call the function
    titles.append(title) # save the title
    news.append(conts) # save the contents
print(titles)
print(news)
len(titles) #17
len(news) #17
news[0]
news[-1]
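Live pages change their markup over time; if select_one() finds nothing it returns None and .text raises AttributeError. A hedged variant of the call loop above that simply skips such pages (same names as above):
titles = [] # store the titles
news = [] # store the contents
for url in news_urls :
    try :
        title, conts = crawler_fn(url) # may fail if the page layout changed
    except Exception as e :
        print('skip :', url, e) # skip pages that no longer match the selectors
        continue
    titles.append(title)
    news.append(conts)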
7. csv file save
import pandas as pd
daum_news = pd.DataFrame({'titles':titles, 'news':news},
columns=['titles','news'])
daum_news.info()
daum_news.head()
path = r"C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data"
Do not save the row index : index=None
daum_news.to_csv(path + '/daum_news.csv', index=None)
news = pd.read_csv(path + '/daum_news.csv')
news
news Query Crawling
1. http://media.daum.net -> shortcut : news box arrangement history (배열이력)
Select a specific date and page
2. https://news.daum.net/newsbox?regDate=20211119&tab_cate=NE&page=1
f'https://news.daum.net/newsbox?regDate={date}&tab_cate=NE&page={page}'
Year-by-year news collection : e.g. 10 years of news
ex) 20111030 ~ 20211030
import urllib.request as req # url request
from bs4 import BeautifulSoup # html parsing
import pandas as pd # date generation
Collection period : 20201101 ~ 20210330 (5 months), 5 pages per date
1. Generate the dates for the collection period
dates = pd.date_range(start="2020-11-01", end = "2021-03-30") # 5 months
print(dates)
import re # date preprocessing
Dates = []
for date in dates :
    #print(date) # 2020-11-01 00:00:00 -> 20201101
    Dates.append(re.sub('-', '', str(date))[:8])
print(Dates)
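The same yyyymmdd strings can be produced without re, because a DatetimeIndex supports strftime directly; an equivalent sketch (result stored in Dates2 here just to keep the original Dates intact):
Dates2 = list(dates.strftime('%Y%m%d')) # e.g. ['20201101', '20201102', ...]
print(Dates2[:3])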
2. Crawler function(date, number of pages)
def crawler_fn(date, pages=5) :
    day_news = [] # one day's news
    for page in range(1, pages+1) : # pages 1 ~ 5
        #1. build the url
        url = f'https://news.daum.net/newsbox?regDate={date}&tab_cate=NE&page={page}'
        #2. request the url
        res = req.urlopen(url)
        src = res.read()
        #3. decode & parse the html
        data = src.decode('utf-8')
        html = BeautifulSoup(data, 'html.parser')
        #e.g. <a href="//v.daum.net/v/20201103235720871" class="link_txt"> (Korean headline) </a>
        #4. collect the text of the <a> tags
        links = html.select('a[class="link_txt"]') # returns a list
        page_news = [] # one page's news
        for a in links :
            news = str(a.text).strip()
            page_news.append(news)
        print(page_news)
        # add this day's news : keep only the first 40 links (drop the related-news links after them)
        day_news.extend(page_news[:40]) # [1page, 2page, 3page, 4page, 5page]
    return day_news # one day's news
Test : collect one day's news
day_news = crawler_fn('20201101')
day_news
len(day_news) #670
3. Call the crawler function
crawling_data = []
for date in Dates :
    day_news = crawler_fn(date) # one day's news - list
    crawling_data.append(day_news) # 150 days of news - list
The same loop as a list comprehension
crawling_data = [crawler_fn(date) for date in Dates]
crawling_data : [[day1],[day2],....[day150]]
print(crawling_data)
crawling_data[0] #day1 news
crawling_data[-1] #day150 news
len(crawling_data) #150
4. file save
import pickle #object -> binary file
path = r'C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data'
file = open(path + '/news_data.pkl', mode='wb')
pickle.dump(crawling_data, file)
file.close()
file load
file = open(path + '/news_data.pkl', mode='rb')
news_data = pickle.load(file)
print(news_data)
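The same save/load can be written with a context manager so the file handle is closed even if dump/load raises; a minimal sketch with the same path and object names:
import pickle
with open(path + '/news_data.pkl', mode='wb') as file : # file closed automatically
    pickle.dump(crawling_data, file)
with open(path + '/news_data.pkl', mode='rb') as file :
    news_data = pickle.load(file)
len(news_data) # 150 (one entry per collected day)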
naver newsCrawling
import urllib.request as req # url request
from bs4 import BeautifulSoup # html parsing
import re # regular expressions : filter the urls
import pandas as pd #DataFrame(title + contents)
1. base url : Naver news search
naver.com -> click [News] -> enter the search term (대통령선거, presidential election) -> [Options] -> period : 6 months
-> click page 1
-> click page 2
https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%8C%80%ED%86%B5%EB%A0%B9%EC%84%A0%EA%B1%B0&sort=0&photo=0&field=0&pd=6&ds=2021.06.02&de=2021.11.29&cluster_rank=53&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:6m,a:all&start=11
-> click page 3
https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%8C%80%ED%86%B5%EB%A0%B9%EC%84%A0%EA%B1%B0&sort=0&photo=0&field=0&pd=6&ds=2021.06.02&de=2021.11.29&cluster_rank=106&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:6m,a:all&start=21
query = search term
sort = sort order (0, 1, 2)
ds = search start date
de = search end date
start = paging offset (each results page contains 10 news links)
ex) 1page -> start=1, 2page -> start=11, 3page -> start=21
page number vs start variable
pages = 10 # pages 1~10
max_pages = (pages-1) * 10 + 1 # 91
for start in range(1, max_pages+1, 10) : # 1 ~ 91, step=10
    print(start)
page 1  -> start=1
page 2  -> start=11
page 3  -> start=21
  :
page 10 -> start=91
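The long search url can also be assembled from its query parameters with urllib.parse.urlencode, which makes query/ds/de/start easier to change; a sketch that keeps only the parameters discussed above (the other parameters in the captured url are dropped here, and Naver may or may not require them):
from urllib.parse import urlencode
params = {
    'where' : 'news',
    'query' : '대통령선거', # search term, percent-encoded by urlencode
    'sort' : 0, # sort order
    'ds' : '2021.06.02', # search start date
    'de' : '2021.11.29', # search end date
    'start' : 11, # paging offset : page 2
}
url = 'https://search.naver.com/search.naver?' + urlencode(params)
print(url)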
2. Extract urls : collect the Naver News article links
def extract_url(pages) : # number of pages
    max_pages = (pages-1) * 10 + 1
    news_urls = [] # store the Naver News links
    for start in range(1, max_pages+1, 10) :
        # 1) build the base url
        url = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%8C%80%ED%86%B5%EB%A0%B9%EC%84%A0%EA%B1%B0&sort=0&photo=0&field=0&pd=6&ds=2021.06.02&de=2021.11.29&cluster_rank=431&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:6m,a:all&start={start}"
        print(url)
        #2) request the url -> html source
        res = req.urlopen(url)
        src = res.read()
        #3) decode & parse the html
        data = src.decode('utf-8')
        html = BeautifulSoup(data, 'html.parser')
        #4) collect the urls
        '''
        Naver News url collection
        <a href='url'> text </a>
        '''
        links = html.select("div.news_area > div.news_info > div.info_group > a.info")
        # extract the href attribute from each link
        urls = []
        for link in links :
            urls.append(link.attrs['href'])
        print('number of urls : ', len(urls)) # 15 per page
        #print(urls)
        #5) keep only the Naver News urls : https://news.naver.com
        url_patt = re.compile('^https://news.naver.com')
        page_url = [url for url in urls if url_patt.match(url)]
        news_urls.extend(page_url) # single flat list
    return news_urls
Call the function
news_urls = extract_url(10) # pages 1~10
len(news_urls) # 60
news_urls
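When requesting many pages back to back it is polite (and less likely to get the crawler blocked) to pause between requests; a sketch of a hypothetical helper, polite_read(), with an arbitrary 0.5-second delay — not part of the course code:
import time
import urllib.request as req
def polite_read(urls, delay=0.5) : # delay in seconds (arbitrary choice)
    sources = []
    for url in urls :
        sources.append(req.urlopen(url).read()) # fetch one page
        time.sleep(delay) # pause so the server is not hammered
    return sources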
3. Crawler function
def crawler_fn(news_urls) :
    titles = [] # titles
    conts = [] # news contents
    for url in news_urls :
        #1) request the url
        res = req.urlopen(url)
        src = res.read()
        #2) decode & parse the html
        data = src.decode('euc-kr')
        html = BeautifulSoup(data, 'html.parser')
        #3) collect the title and the contents
        title = html.select_one('div.article_info > h3#articleTitle')
        titles.append(str(title.text).strip())
        article = html.select_one('div#articleBody > div#articleBodyContents')
        conts.append(str(article.text).strip())
        #print('title :', titles)
        #print('conts :', conts)
    return titles, conts
Collect the news titles and contents
titles, conts = crawler_fn(news_urls)
len(titles) #60
len(conts) #60
4. csv file save
naver_news = pd.DataFrame({'title' : titles, 'contents': conts},
columns = ['title', 'contents'])
naver_news.info()
path = r'C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data'
naver_news.to_csv(path + '/naver_news.csv', index=None)
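One practical note: a plain utf-8 csv with Korean text often shows up garbled when opened directly in Excel; writing a BOM avoids that. A sketch with an illustrative extra file name:
# 'utf-8-sig' prepends a BOM so Excel detects the encoding correctly
naver_news.to_csv(path + '/naver_news_excel.csv', index=None, encoding='utf-8-sig')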