๋ฐ์ดํ„ฐ๋ถ„์„๊ฐ€ ๊ณผ์ •/Python

DAY57. Python TextMining (2) WebCrawling (Selectors, News Crawling)

LEE_BOMB 2021. 12. 10. 16:16
selector

์„ ํƒ์ž(selector) : ์›น๋ฌธ์„œ ๋””์ž์ธ์šฉ

 

์ข…๋ฅ˜ : id(#), calss
- id : ์ค‘๋ณต ๋ถˆ๊ฐ€ -> 1๊ฐœ์˜ tag์„ ํƒ (๊ธฐ์‚ฌ ํ—ค๋“œ๋ผ์ธ ์ถ”์ถœ์šฉ)
- class : ์ค‘๋ณต ๊ฐ€๋Šฅ -> n๊ฐœ tag์„ ํƒ (์—ฌ๋Ÿฌ ๊ฐœ ๋ฌธ๋‹จ์œผ๋กœ ๊ตฌ์„ฑ ๋œ ๋‚ด์šฉ ์ถ”์ถœ์šฉ)

html.select_one(#'id๋ช…')
html.select('.class๋ช…')
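
A minimal sketch, using a made-up HTML snippet (hypothetical), to show the difference between the two selectors:

from bs4 import BeautifulSoup

doc = '''
<h1 id="headline"> Today's headline </h1>
<p class="article"> first paragraph </p>
<p class="article"> second paragraph </p>
'''
html = BeautifulSoup(doc, 'html.parser')

print(html.select_one('#headline').text)         #one tag -> " Today's headline "
print([p.text for p in html.select('.article')]) #list of tags -> [' first paragraph ', ' second paragraph ']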

 

 

from bs4 import BeautifulSoup #HTML parsing

 

1. Load the HTML source

path = r'C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data\html03.html'
file = open(path, mode = 'r', encoding = 'UTF-8') #'r' = read mode; use the encoding declared in the HTML file
src = file.read()
file.close()
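
An equivalent sketch using a with-statement, which closes the file automatically even if read() raises an exception (same path variable as above):

with open(path, mode = 'r', encoding = 'UTF-8') as file :
    src = file.read() #file is closed when the block ends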

 

 

 

2. html ํŒŒ์‹ฑ

html = BeautifulSoup(src, 'html.parser')
print(html)

 

 

 

3. ์„ ํƒ์ž ์ด์šฉํ•œ ํƒœ๊ทธ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ

1) id์„ ํƒ์ž

table = html.select_one('#tab') #id='tab'
print(table)

 

2) ์„ ํƒ์ž์™€ ๊ณ„์ธต๊ตฌ์กฐ

ths = html.select('#tab > tr > th') #<table> > <tr> > <th> -> ํ–‰ ์•ˆ์˜ 4๊ฐœ์˜ ํƒœ๊ทธ ํ•œ๊บผ๋ฒˆ์— ๊บผ๋‚ด๊ธฐ
print(ths) #[<th id="id"> ํ•™๋ฒˆ </th>, <th id="name"> ์ด๋ฆ„ </th>, <th id="major"> ํ•™๊ณผ </th>, <th id="email"> ์ด๋ฉ”์ผ </th>]

for th in ths :
    print(th.text)

ํ•™๋ฒˆ
์ด๋ฆ„
ํ•™๊ณผ
์ด๋ฉ”์ผ

 

3) class ์„ ํƒ์ž : '.class๋ช…'

trs = html.select('.odd') #calss = "odd"
print(trs) #list๋ฐ˜ํ™˜

 

4) ํƒœ๊ทธ[์†์„ฑ = '๊ฐ’']

trs2 = html.select("tr[class='odd']") #odd์†์„ฑ์„ ๊ฐ€์ง„ tr๋งŒ ์„ ํƒํ•˜๊ฒ ๋‹ค (5ํ–‰ ์ค‘ 2ํ–‰๋งŒ ์„ ํƒ)
print(trs2) #list๋ฐ˜ํ™˜. 3, 4๋ฒˆ์˜ ๊ณผ์ •์€ ๊ฐ™์€ ๊ฒฐ๊ณผ๋ฅผ ๋ฐ˜ํ™˜ํ•œ๋‹ค. 4๋ฒˆ์˜ ๋ฐฉ๋ฒ•์„ ๋” ๋งŽ์ด ์‚ฌ์šฉํ•จ.

[<tr class="odd"> <!-- 3ํ–‰(ํ™€์ˆ˜) -->
<td> 201602 </td>
<td> ์ด์ˆœ์‹  </td>
<td> ํ•ด์–‘ํ•™๊ณผ </td>
<td> lee@naver.com </td>
</tr>, <tr class="odd"> <!-- 5ํ–‰ -->
<td> 201604 </td>
<td> ์œ ๊ด€์ˆœ </td>
<td> ์œ ์•„๊ต์œก </td>
<td> you@naver.com </td>
</tr>]

 

ํ™€์ˆ˜ ํ–‰ ๋‚ด์šฉ ์ถœ๋ ฅ

for tr in trs2 :    
    tds = tr.find_all('td') #list 
    for td in tds :
        print(td.text) #๋‚ด์šฉ ์ถœ๋ ฅ

 

 

 

 

 

 

newsCrawling

ํ˜„์žฌ ์‹œ๊ฐ news Crawling
url : http://media.daum.net

 

 

import urllib.request as req #request URLs
from bs4 import BeautifulSoup #HTML parsing

url = "http://media.daum.net"

 

1. url ์š”์ฒญ

res = req.urlopen(url)
src = res.read() # source ์ฝ๊ธฐ
print(src)
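
If the request is rejected (e.g. HTTP 403), a common workaround is to send a browser-style User-Agent header. A sketch, assuming the site accepts such a header (the header value is an assumption, not part of the original code):

request = req.Request(url, headers = {'User-Agent': 'Mozilla/5.0'}) #hypothetical header value
res = req.urlopen(request)
src = res.read()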

 

 

 

2. ๋””์ฝ”๋”ฉ & html ํŒŒ์‹ฑ

data = src.decode('utf-8')
html = BeautifulSoup(data, 'html.parser')
print(html)

 

 

 

3. ํƒœ๊ทธ[์†์„ฑ=๊ฐ’] ์š”์†Œ ์ถ”์ถœ

links = html.select('a[class="link_txt"]') #<a href="https://v.daum.net/v/20211126102955289" class="link_txt" data-tiara-layer="article" data-tiara-id="20211126102955289" data-tiara-type="harmony" data-tiara-ordnum="4" data-tiara-custom="contentUniqueKey=hamny-20211126102955289">'ํ‘œ์  ์ˆ˜์‚ฌ' ๋ฐ˜๋ฐœ์—๋„..๊ณต์ˆ˜์ฒ˜, ์˜ˆ์ •๋Œ€๋กœ ๋Œ€๊ฒ€ ์••์ˆ˜์ˆ˜์ƒ‰</a>
len(links) # 62
print(links)

 

a ํƒœ๊ทธ ๋‚ด์šฉ ์ถ”์ถœ

contents = [] #๋‚ด์šฉ ์ €์žฅ 
cnt = 0
for link in links :     
    tmp = str(link.text) #๋ฌธ์ž์—ด ๋ณ€ํ™˜ 
    contents.append(tmp.strip()) #๋ฌธ์žฅ ๋ ๋ถˆ์šฉ์–ด(๊ณต๋ฐฑ,์ œ์–ด๋ฌธ์ž) ์ œ๊ฑฐ 
    print(cnt, '->', tmp.strip())
    cnt += 1 #์นด์šดํ„ฐ

 

 

4. url ์ˆ˜์ง‘ : ์ƒ์„ธ news ์ˆ˜์ง‘ ๋ชฉ์ 

urls = [] #url ์ €์žฅ 

for link in links[:46] : 
    try : 
        urls.append(link.attrs['href'])#a ํƒœ๊ทธ์˜ url ์ถ”์ถœ 
    except Exception as e :
        print('์˜ˆ์™ธ๋ฐœ์ƒ : ',e)

print(urls)
len(urls) #46

 

 

 

5. news ๊ด€๋ จ url ์„ ๋ณ„
https://news.v.daum.net

import re

url_pat = re.compile('^https://news.v.daum.net') #ํŒจํ„ด ๊ฐ์ฒด ์ƒ์„ฑ

 

ํŒจํ„ด๊ณผ ์ผ์น˜ํ•œ url ์„ ๋ณ„

news_urls = [url for url in urls if url_pat.match(url)]

len(news_urls) #17
print(news_urls)

 

 

 

6. Crawler ํ•จ์ˆ˜ : ๋ฌธ์„œ ์ˆ˜์ง‘ ์—ญํ• 
def crawler_fn(url) :
1. url ์š”์ฒญ

def crawler_fn(url) : 
    #1. url ์š”์ฒญ 
    res = req.urlopen(url)
    src = res.read() #source ์ฝ๊ธฐ
    
    #2. ๋””์ฝ”๋”ฉ & html ํŒŒ์‹ฑ 
    data = src.decode('utf-8')
    html = BeautifulSoup(data, 'html.parser')
    
    #3. ์ œ๋ชฉ๊ณผ ๋‚ด์šฉ ์ˆ˜์ง‘ 
    #1) ์ œ๋ชฉ ์ˆ˜์ง‘ : 1๊ฐœ tag 
    title = str(html.select_one('h3[class="tit_view"]').text).strip()
    
    #2) ๋‚ด์šฉ ์ˆ˜์ง‘ : ์—ฌ๋Ÿฌ๊ฐœ tag 
    '''
    div.news_view > div.article_view > section > p    
    '''
    article = html.select('div[class="news_view"] > div[class="article_view"] > section > p')
    
    #4. ์—ฌ๋Ÿฌ๊ฐœ ๋ฌธ๋‹จ(p) -> ํ•œ ๊ฐœ์˜ ๋ณ€์ˆ˜๋กœ ๋ฌถ์Œ 
    conts = ""
    for p in article :
        text = str(p.text).strip()
        conts += text #ํ…์ŠคํŠธ ๋ˆ„์  
        
    return title, conts

 

Crawler ํ•จ์ˆ˜ ํ˜ธ์ถœ 

titles = [] #์ œ๋ชฉ ์ €์žฅ 
news = [] #๋‚ด์šฉ ์ €์žฅ 

for url in news_urls :  
    title, conts = crawler_fn(url) #ํ•จ์ˆ˜ ํ˜ธ์ถœ 
    titles.append(title) #์ œ๋ชฉ ์ €์žฅ 
    news.append(conts) #๋‚ด์šฉ ์ €์žฅ 

print(titles)
print(news)
len(titles) #17
len(news) #17
news[0]
news[-1]




7. csv file save 

import pandas as pd 

daum_news = pd.DataFrame({'titles':titles, 'news':news},
             columns=['titles','news'])

daum_news.info()

daum_news.head()

path = r"C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data"


ํ–‰๋ฒˆํ˜ธ ์ €์žฅ ์•ˆํ•จ : index=None

daum_news.to_csv(path + '/daum_news.csv', index=None)

news = pd.read_csv(path + '/daum_news.csv')
news

news Query Crawling

1. http://media.daum.net -> go to 배열이력 (news arrangement history)
   Select a specific date and page 
2. https://news.daum.net/newsbox?regDate=20211119&tab_cate=NE&page=1 
   f'https://news.daum.net/newsbox?regDate={date}&tab_cate=NE&page={page}'

Collecting news by year, e.g. 10 years of news (see the sketch below): 
 ex) 20111030 ~ 20211030
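
A sketch of how the 10-year example above could be expanded into request URLs. The 5-pages-per-day figure is an assumption for illustration; the actual collection below uses a 5-month window:

import pandas as pd

dates = pd.date_range(start = '2011-10-30', end = '2021-10-30').strftime('%Y%m%d') #'20111030' ~ '20211030'

urls = [f'https://news.daum.net/newsbox?regDate={date}&tab_cate=NE&page={page}'
        for date in dates for page in range(1, 6)] #5 pages per day (assumption)
print(len(urls)) #number of days * 5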


import urllib.request as req #request URLs 
from bs4 import BeautifulSoup #HTML parsing
import pandas as pd #date generation

 

์ˆ˜์ง‘๊ธฐ๊ฐ„ : 20201101 ~ 20210330 : 5๊ฐœ์›”, page : 5์”ฉ ๋‹จ์œ„ 


1. ์ˆ˜์ง‘๊ธฐ๊ฐ„ date ์ƒ์„ฑ 

dates = pd.date_range(start="2020-11-01", end = "2021-03-30") #5๊ฐœ์›” 
print(dates)

import re #๋‚ ์งœ ์ „์ฒ˜๋ฆฌ 

Dates = []
for date in dates :
    #print(date) # 2020-11-01 00:00:00 -> 20201101
    Dates.append(re.sub('-', '', str(date))[:8])
    
print(Dates)
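
A simpler alternative sketch (same result, no regex), assuming a pandas version whose DatetimeIndex supports strftime:

Dates = list(dates.strftime('%Y%m%d')) #['20201101', '20201102', ..., '20210330']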




2. Crawler ํ•จ์ˆ˜(๋‚ ์งœ, ํŽ˜์ด์ง€์ˆ˜)

def crawler_fn(date, pages=5) :
    day_news = [] #1day news ์ €์žฅ 
    
    for page in range(1, pages+1) : #1 ~ 5 ํŽ˜์ด์ง€ 
        #1. url ๊ตฌ์„ฑ 
        url = f'https://news.daum.net/newsbox?regDate={date}&tab_cate=NE&page={page}' 

        #2. url ์š”์ฒญ 
        res = req.urlopen(url)
        src = res.read()
        
        #3. ๋””์ฝ”๋“œ & html ํŒŒ์‹ฑ 
        data = src.decode('utf-8')
        html = BeautifulSoup(data, 'html.parser')
        
        #<a href="//v.daum.net/v/20201103235720871" class="link_txt">[์‹ฌ์ธต์ธํ„ฐ๋ทฐ] ์—ญ๋Œ€ '์ตœ๊ณ  ํˆฌํ‘œ์œจ'..๊น€๋™์„ ๋ฏธ์ฃผํ•œ์ธ์œ ๊ถŒ์ž์—ฐ๋Œ€ ๋Œ€ํ‘œ</a>
        #4) a ํƒœ๊ทธ ๋‚ด์šฉ ์ˆ˜์ง‘ 
        links = html.select('a[class="link_txt"]') #list ๋ฐ˜ํ™˜ 
        
        page_news = [] #1page news 
        
        for a in links :
            news = str(a.text).strip() 
            page_news.append(news)
            
        print(page_news)
        
        #1์ผ news ์ถ”๊ฐ€ : 40์ƒ‰์ธ ์ดํ›„ ๊ด€๋ จ ๋‰ด์Šค ์ œ์™ธ      
        day_news.extend(page_news[:40]) #[1page, 2page, 3page, 4page, 5page]
        
    return day_news #1์ผ news ๋‚ด์šฉ


1์ผ news ์ˆ˜์ง‘ ํ…Œ์ŠคํŠธ 

day_news = crawler_fn('20201101')
day_news
len(day_news) #670




3. Crawler ํ•จ์ˆ˜ ํ˜ธ์ถœ 

crawling_data = []
for date in Dates :
    day_news = crawler_fn(date) #1์ผ news - list
    crawling_data.append(day_news) #150์ผ news - list


list ๋‚ดํฌ 

crawling_data = [crawler_fn(date) for date in Dates]

    
crawling_data : [[day1],[day2],....[day150]]

print(crawling_data)

crawling_data[0] #day1 news 
crawling_data[-1] #day150 news 

len(crawling_data) #150

 



4. file save 

import pickle #object -> binary file 

path = r'C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data'

file = open(path + '/news_data.pkl', mode='wb')
pickle.dump(crawling_data, file)
file.close()


file load 

file = open(path + '/news_data.pkl', mode='rb')
news_data = pickle.load(file)
file.close()
print(news_data)

naver newsCrawling
import urllib.request as req #request URLs 
from bs4 import BeautifulSoup #HTML parsing 
import re #regular expressions : filter URLs 
import pandas as pd #DataFrame(title + contents)

 


1. Base url : Naver news search 
naver.com -> click [뉴스] -> enter the search keyword (대통령선거, "presidential election") -> [옵션] -> period : 6 months 
-> click page 1

https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%8C%80%ED%86%B5%EB%A0%B9%EC%84%A0%EA%B1%B0&sort=0&photo=0&field=0&pd=6&ds=2021.06.02&de=2021.11.29&cluster_rank=18&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:6m,a:all&start=1

 

-> 2page ํด๋ฆญ 
https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%8C%80%ED%86%B5%EB%A0%B9%EC%84%A0%EA%B1%B0&sort=0&photo=0&field=0&pd=6&ds=2021.06.02&de=2021.11.29&cluster_rank=53&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:6m,a:all&start=11 

 

-> 3page ํด๋ฆญ 
https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%8C%80%ED%86%B5%EB%A0%B9%EC%84%A0%EA%B1%B0&sort=0&photo=0&field=0&pd=6&ds=2021.06.02&de=2021.11.29&cluster_rank=106&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:6m,a:all&start=21

query=๊ฒ€์ƒ‰์–ด 
sort=์ •๋ ฌ๋ฐฉ์‹(0,1,2)
ds=๊ฒ€์ƒ‰ ์‹œ์ž‘์ผ 
de=๊ฒ€์ƒ‰ ์ข…๋ฃŒ์ผ 
start=์ˆ˜์ง‘ํ•  ํŽ˜์ด์ง€๋ฒˆํ˜ธ(1page์— news๋งํฌ 10๊ฐœ ํฌํ•จ) 
ex) 1page -> start=1, 2page -> start=11, 3page -> start=21
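
A sketch of how these parameters map onto the search URL, using urllib.parse.urlencode. Only the parameters listed above are included; the omitted ones keep the values shown in the URLs above:

from urllib.parse import urlencode

params = {
    'where' : 'news',
    'query' : '대통령선거',   #search keyword, URL-encoded automatically
    'sort'  : 0,             #sort order
    'ds'    : '2021.06.02',  #search start date
    'de'    : '2021.11.29',  #search end date
    'start' : 1              #1, 11, 21, ... (10 news links per page)
}
url = 'https://search.naver.com/search.naver?' + urlencode(params)
print(url)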

 

page๋ฒˆํ˜ธ vs start๋ณ€์ˆ˜ 

pages = 10 #1~10ํŽ˜์ด์ง€ 
max_pages = (pages-1) * 10 + 1 #21

for start in range(1, max_pages+1, 10) : #1~21, step=10
    print(start)

1 -> 1
2 -> 11
3 -> 21
 :
10 -> 91      

 


2. url ์ถ”์ถœ : ๋„ค์ด๋ฒ„ ๋‰ด์Šค ๋งํฌ ์ˆ˜์ง‘ 

def extract_url(pages) : #ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ 
    max_pages = (pages-1) * 10 + 1
    
    news_urls = [] #๋„ค์ด๋ฒ„ ๋‰ด์Šค ๋งํฌ ์ €์žฅ 
    for start in range(1, max_pages+1, 10) :
        # 1) base url ๊ตฌ์„ฑ 
        url = f"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%8C%80%ED%86%B5%EB%A0%B9%EC%84%A0%EA%B1%B0&sort=0&photo=0&field=0&pd=6&ds=2021.06.02&de=2021.11.29&cluster_rank=431&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:6m,a:all&start={start}"
        print(url)
        
        #2) url ์š”์ฒญ -> html source 
        res = req.urlopen(url)
        src = res.read()
        
        #3) ๋””์ฝ”๋”ฉ & html ํŒŒ์‹ฑ 
        data = src.decode('utf-8')
        html = BeautifulSoup(data, 'html.parser')
        
        #4) url ์ˆ˜์ง‘ 
        '''
        ๋„ค์ด๋ฒ„ ๋‰ด์Šค url ์ˆ˜์ง‘
        <a href='url'> ๋‚ด์šฉ </a>
        '''
        links = html.select("div.news_area > div.news_info > div.info_group > a.info")
        #href ์†์„ฑ ์ถ”์ถœ : url ์ˆ˜์ง‘ 
        urls = []
        for link in links :
            urls.append(link.attrs['href'])
            
        print('urls ์ˆ˜ : ', len(urls)) #urls ์ˆ˜ :  15
        #print(urls) 
        
        #5) url ์„ ์ • : https://news.naver.com
        url_patt = re.compile('^https://news.naver.com')
        page_url = [url for url in urls if url_patt.match(url)]
        
        news_urls.extend(page_url) #๋‹จ์ผlist  
        
    return news_urls

 

ํ•จ์ˆ˜ ํ˜ธ์ถœ 

news_urls = extract_url(10) #1~10 ํŽ˜์ด์ง€ 
len(news_urls) # 60
news_urls



 

3. Crawler ํ•จ์ˆ˜ 

def crawler_fn(news_urls) : 
    titles = [] # ์ œ๋ชฉ 
    conts = [] # ๋‰ด์Šค ๋‚ด์šฉ 
    
    for url in news_urls : 
        #1) url ์š”์ฒญ 
        res = req.urlopen(url)
        src = res.read()
        
        #2) ๋””์ฝ”๋”ฉ & html ํŒŒ์‹ฑ 
        data = src.decode('euc-kr')
        html = BeautifulSoup(data, 'html.parser')
        
        # 3) ์ œ๋ชฉ๊ณผ ๋‚ด์šฉ ์ˆ˜์ง‘ 
        title = html.select_one('div.article_info > h3#articleTitle')
        titles.append(str(title.text).strip())
        
        article = html.select_one('div#articleBody > div#articleBodyContents')
        conts.append(str(article.text).strip())
        #print('title :', titles)
        #print('conts :', conts)
        
    return titles, conts

 

news ์ œ๋ชฉ๊ณผ ๋‚ด์šฉ ์ˆ˜์ง‘ 

titles, conts = crawler_fn(news_urls)

len(titles) #60 
len(conts) #60

 

 


4. csv file save 

naver_news = pd.DataFrame({'title' : titles, 'contents': conts},
             columns = ['title', 'contents'])

naver_news.info()

path = r'C:\ITWILL\4_Python-2\workspace\chap10_TextMining\data'
naver_news.to_csv(path + '/naver_news.csv', index=None)
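
Optional check, mirroring the Daum section : reload the saved file to verify it.

naver_check = pd.read_csv(path + '/naver_news.csv')
naver_check.head()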