๊ฐœ์ธ๊ณต๋ถ€/Python

98. ํŒŒ์ด๋„ ํ”„๋กœ์ ํŠธ (3)๋„ค์ด๋ฒ„ ๋‰ด์Šค ํฌ๋กค๋Ÿฌ ๋งŒ๋“ค๊ธฐ

LEE_BOMB 2022. 1. 6. 21:32

naver ๋‰ด์Šค ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ ํŠน์„ฑ  
naver์—์„œ๋Š” 1๊ฐœ ํ‚ค์›Œ๋“œ ๋‹น ์ตœ๋Œ€ 4,000๊ฐœ ๊นŒ์ง€ ์ œ๊ณตํ•˜๋ฏ€๋กœ ํฌ๋กค๋งํ•  ์ตœ๋Œ€ ํŽ˜์ด์ง€๋ฅผ 1~400 ํŽ˜์ด์ง€๊นŒ์ง€ ์ง€์ • ๊ฐ€๋Šฅ

0. ํŒจํ‚ค์ง€ ์ž„ํฌํŠธ

import urllib.request as req
from bs4 import BeautifulSoup
import re
import pandas as pd



 

1. url ์ถ”์ถœ : news ๋งํฌ ์ˆ˜์ง‘ ํ•จ์ˆ˜ ์ƒ์„ฑ

def extract_url(pages): # number of result pages to crawl
    max_start = (pages - 1) * 10 + 1 # last 'start' value; e.g. pages=10 -> start=91
    news_urls = [] # collected article URLs across all pages
    
    page = 1 # page counter
    for start in range(1, max_start + 1, 10): # start = 1, 11, 21, ... (91 when pages=10)
        #1) build the request URL (placeholder for the Naver news search URL; only the start parameter changes per page)
        url = f"search URL={start}"
        print('page =', page)
        page += 1
        
        #2) request the URL -> html source
        res = req.urlopen(url)
        data = res.read()
               
        #3) parse the html
        src = data.decode('utf-8') 
        html = BeautifulSoup(src, 'html.parser')     
        links = html.select('div.news_area > div.news_info > div.info_group > a.info')
         
        #4) collect the link URLs
        urls = [link.attrs['href'] for link in links] 
        
        #5) filter: keep only Naver News URLs
        url_pattern = re.compile(r'^https://news\.naver\.com')

        page_urls = [u for u in urls if url_pattern.match(u)]
        news_urls.extend(page_urls) # save this page's URLs
        print('extracted URL count =', len(page_urls)) # number of URLs crawled from this page
        
    return news_urls
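
The function above assumes every request succeeds and that each page decodes as utf-8. As an optional refinement (a sketch, not part of the original code), the request step could be wrapped in a helper that skips failed pages and pauses briefly between requests:

# optional hardening sketch: a helper that could replace steps 2)-3) above
import time
import urllib.request as req
from urllib.error import URLError

def fetch_html(url, delay=0.5):
    """Return decoded HTML for url, or None if the request fails."""
    try:
        time.sleep(delay)                        # small pause between requests
        return req.urlopen(url).read().decode('utf-8')
    except (URLError, UnicodeDecodeError) as e:  # skip pages that fail to load or decode
        print('request failed:', url, e)
        return None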

2. url ์ถ”์ถœ & ํ™•์ธ 

news_urls = extract_url(400) # crawl 400 pages (the maximum allowed)
print(news_urls) # check the collected Naver News URLs
len(news_urls) # total number of URLs
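
If the same article happens to be linked from more than one result page, the duplicates can be dropped while keeping the original order (a minimal sketch, not in the original code):

# optional: drop duplicate URLs, preserving insertion order (dict keys are ordered in Python 3.7+)
news_urls = list(dict.fromkeys(news_urls))
len(news_urls) # URL count after de-duplication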




3. Crawler function: collect the title and body from each news URL

def crawler_func(news_urls): # URLs to crawl
    titles = [] # article titles
    conts = [] # article bodies
    for url in news_urls : # each collected article URL
        #1) request the URL -> html source
        res = req.urlopen(url)
        data = res.read()
        #print(data) # <meta charset="euc-kr">
        
        #2) parse the html
        try : # handle decode failures and missing tags
            src = data.decode('euc-kr') 
            html = BeautifulSoup(src, 'html.parser')  
            #select_one : targets a single tag
            title = html.select_one('div.article_info > h3#articleTitle') 
            titles.append(str(title.text).strip())
                            
            article = html.select_one('div[id="articleBody"] > div[id="articleBodyContents"]') 
            conts.append(str(article.text).strip()) # collect the tag text

            #print(titles)
            #print(conts)
        except Exception as e:
            print('exception occurred :', e)
            print('url :', url)
    return titles, conts
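
The article pages here are served as euc-kr (see the commented-out charset check above), and decode failures simply fall into the except block. A hedged alternative, assuming a few pages might use a different encoding, is to try euc-kr first and fall back to utf-8:

# optional decoding sketch: euc-kr first, utf-8 as a fallback (assumption, not in the original)
def decode_html(data):
    """Decode the raw bytes of an article page."""
    try:
        return data.decode('euc-kr')
    except UnicodeDecodeError:
        return data.decode('utf-8')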



    
4. ์ œ๋ชฉ๊ณผ ๋‚ด์šฉ ์ˆ˜์ง‘ 

titles, conts = crawler_func(news_urls) 
len(titles) # number of titles
len(conts) # number of bodies
print(conts[0]) # the boilerplate sentence below appears at the start of each body and will be removed

 

Every article body begins with the boilerplate string below (followed by three line breaks):
// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}

#์ฒซ๋ฒˆ์งธ ๋‚˜ํƒ€๋‚œ ๋ฌธ์žฅ ์ œ์™ธ
remove_str ="// flash ์˜ค๋ฅ˜๋ฅผ ์šฐํšŒํ•˜๊ธฐ ์œ„ํ•œ ํ•จ์ˆ˜ ์ถ”๊ฐ€\nfunction _flash_removeCallback\(\) \{\}\n\n\n"
contents = [re.sub(remove_str,"", cont) for cont in conts]
contents[0] #๋ฌธ์žฅ ์ œ์™ธ๋จ
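
Because the boilerplate is one fixed literal, the regex metacharacters do not have to be escaped by hand; re.escape, or plain str.replace, gives the same result (a sketch of the alternative):

# alternative cleanup sketch: let re.escape handle the special characters
boiler = '// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}\n\n\n'
contents = [re.sub(re.escape(boiler), '', cont) for cont in conts]
# or, with no regex at all:
# contents = [cont.replace(boiler, '') for cont in conts]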


5. Save to a CSV file

naver_news = pd.DataFrame({'titles': titles, 'contents': contents}, 
                          columns=['titles', 'contents'])
naver_news.info()
print(naver_news.head())

path = r'D:\ITWILL'
# '영화명' (movie title) stands in for the search keyword used
naver_news.to_csv(path + '/영화명_news.csv', index=False, encoding="utf-8-sig")
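
To confirm the file was written correctly, it can be read back with pandas (a quick check using the same path and placeholder file name as above):

# quick check: reload the saved CSV and confirm its shape and column names
check = pd.read_csv(path + '/영화명_news.csv', encoding='utf-8-sig')
print(check.shape)   # (number of collected articles, 2)
print(check.columns) # ['titles', 'contents']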