๊ฐœ์ธ๊ณต๋ถ€/Python

77. Python Text Mining Exercises (1)

LEE_BOMB 2021. 12. 11. 22:48
๋ฌธ1) member.html ์›น ๋ฌธ์„œ๋ฅผ ๋Œ€์ƒ์œผ๋กœ ๋‹ค์Œ ์กฐ๊ฑด์— ๋งž๊ฒŒ ๋‚ด์šฉ์„ ์ถ”์ถœํ•˜์‹œ์˜ค. 

<์กฐ๊ฑด> <tr> ํƒœ๊ทธ ํ•˜์œ„ ํƒœ๊ทธ์ธ <td> ํƒœ๊ทธ์˜ ๋ชจ๋“  ๋‚ด์šฉ ์ถœ๋ ฅ
<์ถœ๋ ฅ ๊ฒฐ๊ณผ>
์•„์ด๋””  
hong123 
๋น„๋ฐ€๋ฒˆํ˜ธ 
1234    
์ด๋ฆ„   
ํ™๊ธธ๋™

from bs4 import BeautifulSoup


1. ํŒŒ์ผ ์ฝ๊ธฐ 

file = open("C:/ITWILL/4_Python-2/workspace/chap10_TextMining/data/member.html", mode='r', encoding='utf-8')
source = file.read()
file.close()
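
As a side note, the same read can be written with a context manager so the file is closed automatically even if an error occurs (an equivalent sketch using the same path):

with open("C:/ITWILL/4_Python-2/workspace/chap10_TextMining/data/member.html",
          mode='r', encoding='utf-8') as file:
    source = file.read() # the file is closed automatically on exit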



2. Parse the HTML

html = BeautifulSoup(source, 'html.parser')



3. ํƒœ๊ทธ ์ฐพ๊ธฐ 

tds = html.find_all('td') # find all <td> tags
print(tds) # returns a list-like ResultSet

[<td> ์•„์ด๋””  </td>, <td> hong123 </td>, <td> ๋น„๋ฐ€๋ฒˆํ˜ธ </td>, <td> 1234    </td>, <td> ์ด๋ฆ„   </td>, <td> ํ™๊ธธ๋™ </td>]


4. ํƒœ๊ทธ ๋‚ด์šฉ ์ถœ๋ ฅ 

contents = [td.text for td in tds] # extract the text of each <td> tag
print(contents)

print('<Output>')
for c in contents:
    print(c)
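
The printed cells keep the padding spaces from the HTML source. If you want them trimmed, get_text(strip=True) is a small variant of step 4 (the expected values follow from the output of step 3):

contents = [td.get_text(strip=True) for td in tds] # strip padding whitespace from each cell
print(contents) # ['아이디', 'hong123', '비밀번호', '1234', '이름', '홍길동']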


๋ฌธ2) urls์˜ url์„ ๋Œ€์ƒ์œผ๋กœ ๋‹ค์Œ ์กฐ๊ฑด์— ๋งž๊ฒŒ ์›น ๋ฌธ์„œ์˜ ์ž๋ฃŒ๋ฅผ ์ˆ˜์ง‘ํ•˜์‹œ์˜ค.

์กฐ๊ฑด1> http://์œผ๋กœ ์‹œ์ž‘ํ•˜๋Š” url๋งŒ์„ ๋Œ€์ƒ์œผ๋กœ ํ•œ๋‹ค.
์กฐ๊ฑด2> url์— ํ•ด๋‹นํ•˜๋Š” ์›น ๋ฌธ์„œ๋ฅผ ๋Œ€์ƒ์œผ๋กœ <a> ํƒœ๊ทธ(tag) ๋‚ด์šฉ์„ ์ถœ๋ ฅํ•œ๋‹ค.
from urllib.request import urlopen #ํ•จ์ˆ˜ : ์›๊ฒฉ ์„œ๋ฒ„ url ์š”์ฒญ 
from bs4 import BeautifulSoup #ํด๋ž˜์Šค : html ํŒŒ์‹ฑ
import re #์ •๊ทœํ‘œํ˜„์‹

urls = ['http://www.daum.net', 'www.daum.net', 'http://www.naver.com']


๋‹จ๊ณ„1 : url ์ •์ œ

new_urls = []
for url in urls:
    tmp = re.findall('^http://', url) # non-empty list if the URL starts with http://
    if tmp:
        new_urls.append(url)
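
The same filtering can also be done without a regular expression; str.startswith is equivalent here (a one-line sketch):

new_urls = [url for url in urls if url.startswith('http://')] # equivalent filter without re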


๋‹จ๊ณ„2 : url์—์„œ a ํƒœ๊ทธ ๋‚ด์šฉ ์ˆ˜์ง‘ & ์ถœ๋ ฅ

for url in new_urls:
    # 1. request the URL
    print('url :', url)
    req = urlopen(url)
    data = req.read()
    
    # 2. parse the HTML
    src = data.decode('utf-8')
    html = BeautifulSoup(src, 'html.parser')
    
    # 3. find the <a> tags and print their contents
    a_all = html.find_all('a') # find all anchor tags
    print('total number of <a> tags :', len(a_all)) # e.g. total number of <a> tags : 414
    
    for a in a_all:
        print(a.text) # print the tag contents
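
Note that requests to real portals can fail: some sites reject the default urllib User-Agent, and unreachable hosts raise URLError. A defensive fetch might look like the sketch below (fetch_html is a hypothetical helper and the header value is only an example):

from urllib.request import Request, urlopen
from urllib.error import URLError

def fetch_html(url): # hypothetical helper, not part of the exercise
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'}) # browser-like agent
    try:
        with urlopen(request) as res:
            return res.read().decode('utf-8')
    except URLError as e:
        print('request failed :', url, e)
        return None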


๋ฌธ3) ์•„๋ž˜ url์„ ์ด์šฉํ•˜์—ฌ ์–ด๋ฆฐ์ด๋‚ (20210505)์— ์ œ๊ณต๋œ ๋‰ด์Šค ๊ธฐ์‚ฌ๋ฅผ 1~5ํŽ˜์ด์ง€ ํฌ๋กค๋งํ•˜๋Š” ํฌ๋กค๋Ÿฌ ํ•จ์ˆ˜๋ฅผ ์ •์˜ํ•˜๊ณ  ํฌ๋กค๋ง ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜์‹œ์˜ค.
base_url = "https://news.daum.net/newsbox?regDate="   
   
<์กฐ๊ฑด1> ํฌ๋กค๋Ÿฌ ํ•จ์ˆ˜์˜ ํŒŒ๋ผ๋ฏธํ„ฐ(page๋ฒˆํ˜ธ, ๋‚ ์งœ)
<์กฐ๊ฑด2> ํฌ๋กค๋ง ๋Œ€์ƒ  : <a> ํƒœ๊ทธ์˜ 'class=link_txt' ์†์„ฑ์„ ๊ฐ–๋Š” ๋‚ด์šฉ 
<์กฐ๊ฑด3> ํฌ๋กค๋ง ๊ฒฐ๊ณผ ํ™•์ธ  : news ๊ฐœ์ˆ˜์™€  news ์ถœ๋ ฅ  
import urllib.request as req # request a remote URL
from bs4 import BeautifulSoup


ํด๋กœ๋Ÿฌ ํ•จ์ˆ˜(ํŽ˜์ด์ง€์ˆ˜, ๊ฒ€์ƒ‰๋‚ ์งœ) 

def crawler_func(pages, date):
    base_url = "https://news.daum.net/newsbox?regDate="
    crawling_news = [] # news collected across all pages

    url = base_url + date
    # url = https://news.daum.net/newsbox?regDate=20210505
    
    # collect news page by page
    for page in range(1, pages+1): # pages 1~5
        # build the page URL fresh each iteration; appending to url itself
        # (url += p) would accumulate '&page=' parameters across iterations
        page_url = url + '&page=' + str(page)
        # page_url = https://news.daum.net/newsbox?regDate=20210505&page=1
        
        # 1. request the URL
        res = req.urlopen(page_url)
        src = res.read() # source
        data = src.decode('utf-8') # apply decoding
        
        # 2. parse the HTML
        html = BeautifulSoup(data, 'html.parser')
        
        # 3. extract the tag elements
        # 1) collect the tag elements
        a_tag = html.select('a[class="link_txt"]')
        
        # 2) collect the contents
        page_news = [] # news from the current page
        for a in a_tag:
            cont = str(a.string) # get the contents as a string (a.string is None if the tag has child tags)
            page_news.append(cont.strip())
        
        crawling_news.extend(page_news[:40]) # keep at most the first 40 items per page
            
    return crawling_news
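
As a design note, the query string can also be assembled with urllib.parse.urlencode instead of manual concatenation, which handles the encoding of parameter values (a sketch; make_page_url is a hypothetical helper):

from urllib.parse import urlencode

def make_page_url(date, page): # hypothetical helper
    # e.g. https://news.daum.net/newsbox?regDate=20210505&page=1
    return 'https://news.daum.net/newsbox?' + urlencode({'regDate': date, 'page': page})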


ํด๋กœ๋Ÿฌ ํ•จ์ˆ˜ ํ˜ธ์ถœ 

crawling_news = crawler_func(5, '20210505') # (number of pages, search date)

print('ํฌ๋กค๋ง news ๊ฐœ์ˆ˜ =', len(crawling_news)) #200=5*40
print('ํฌ๋กค๋ง news') 
print(crawling_news)
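
If you want to keep the crawled headlines, writing them out one per line is straightforward (a minimal sketch; news.txt is an arbitrary output path):

with open('news.txt', mode='w', encoding='utf-8') as f: # news.txt is a hypothetical file name
    for news in crawling_news:
        f.write(news + '\n')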