# DAY36. Python 정규표현식
# 정규 표현식(Regular Expressions)
# 특정한 규칙을 가진 메타문자를 이용하여 패턴을 지정한 문자열 표현
# [주요 메타문자]
# .x : 임의의 한 문자 뒤에 x가 오는 문자열(ex : abc, mbc -> .bc)
# ^x : x로 시작하는 문자열(접두어 추출)
# x$ : x로 끝나는 문자열(접미어 추출)
# x. : x 다음에 임의의 한 문자가 오는 문자열(ex : t1, t2, ta -> t.)
# x* : x가 0번 이상 반복(없는 경우 포함)
# x+ : x가 1회 이상 반복
# x? : x가 0 ~ 1회 존재
# x{m, n} : x가 m~n 사이 연속
# x{m, } : x가 m 이상 연속
# x{,n} : x가 n 이하 연속
# [x] : x문자 한 개 일치
# | : or 조건식
# \ : 이스케이프 문자를 일반문자로 인식
# \d : 숫자
# \w : 단어
# \s : 공백
# () : 그룹핑, 추출할 패턴 지정
st1 = '1234 abcํ๊ธธ๋ ABC_555_6 ์ด์ฌ๋์'
st2 = 'test1abcABC 123mbc 45test'
urls = ['http://news.com/a/test', 'new.com','http://news.com/b/test', 'http//~']
st3 = 'test^ํ๊ธธ๋ abc ๋ํ*๋ฏผ๊ตญ 123$tbc'
๋ชจ๋(module) : ํจ์ ๋๋ ํด๋์ค๋ฅผ ํฌํจํ ํ์ด์ฌ ํ์ผ (*.py)
์ค์น ๊ฒฝ๋ก : C:/Users/KIM YOON/anaconda3/Lib/re.py
์ ๊ทํํ์๊ณผ ๋ฌธ์์ด ์ฒ๋ฆฌ ํจ์ ์ ๊ณต ๋ชจ๋(python file)
ํ์) import ๋ชจ๋
import re # ๋ชจ๋(re.py) - ๋ฐฉ๋ฒ1
ํ์) from ๋ชจ๋ import ํจ์1, ํจ์2, ํจ์3, ...
from re import findall, match, sub # ๋ฐฉ๋ฒ2 : ๊ถ์ฅ
1. findall('pattern', string)
ํจํด๊ณผ ์ผ์นํ๋ ๋ฌธ์์ด ์ฐพ๊ธฐ -> list ๋ฐํ
1) ์ซ์ ์ฐพ๊ธฐ
print(re.findall('1234', st1)) # ['1234'] : ๋ฐฉ๋ฒ1
print(findall('1234', st1)) # ['1234'] : ๋ฐฉ๋ฒ2
print(findall('[0-9]', st1)) # ['1', '2', '3', '4', '5', '5', '5', '6']
print(findall('[0-9]{3}', st1)) # ['123', '555']
print(findall('[0-9]{3,}', st1)) # ['1234', '555']
print(findall('[0-9]{3,4}', st1)) # ['1234', '555']
print(findall(r'\d{3,4}', st1)) # ['1234', '555']
2) ๋ฌธ์์ด ์ฐพ๊ธฐ
findall('[๊ฐ-ํฃ]{3,}', st1) # ['ํ๊ธธ๋', '์ด์ฌ๋์']
findall('[a-z]{3}', st1) # ['abc']
findall('[a-z|A-Z]{3}', st1) # ['abc', 'ABC']
findall('[a-z]{4}', st1) # [] null๊ฐ
words = st1.split() # ๊ณต๋ฐฑ ๊ธฐ์ค ํ ํฐ ์์ฑ
print(words) # ['1234', 'abcํ๊ธธ๋', 'ABC_555_6', '์ด์ฌ๋์']
names = [] # ํ๊ธ ์ด๋ฆ ์ ์ฅ
for word in words :
result = findall('[๊ฐ-ํฃ]{3,}', word) # '1234'
print(result) # [], ['ํ๊ธธ๋']
if result : # False(null) or True(not null)
#names.append(result) # ์ค์ฒฉ list
names.extend(result) # ๋จ์ผ list
print(names) # [['ํ๊ธธ๋'], ['์ด์ฌ๋์']] -> ['ํ๊ธธ๋', '์ด์ฌ๋์']
# 3) 접두어/접미어 문자열 찾기
# Prefix / suffix matching with the ^ and $ anchors.
st2 = 'test1abcABC 123mbc 45test'
findall('^test', st2)  # ['test']  (string starts with 'test')
findall('^text', st2)  # []        (string does not start with 'text')
findall('test$', st2)  # ['test']  (string ends with 'test')

# Any single character followed by 'bc' matches both 'abc' and 'mbc'.
findall('.bc', st2)  # ['abc', 'mbc']

urls = ['http://news.com/a/test', 'new.com', 'http://news.com/b/test', 'http//~']
print(urls)
# ['http://news.com/a/test', 'new.com', 'http://news.com/b/test', 'http//~']
len(urls)  # 4

urls_re = []  # well-formed URLs only
for url in urls:
    # The dot is escaped so that it matches a literal '.' rather than any
    # character; a non-empty findall() result is truthy.
    if findall(r'^http://news\.com', url):
        print(url)  # print each well-formed URL
        urls_re.append(url)
# Printed by the loop:
# http://news.com/a/test
# http://news.com/b/test

print(urls_re)  # ['http://news.com/a/test', 'http://news.com/b/test']
# 4) 단어(\w) 찾기 : 단어(한글, 영문, 숫자), 단어 아님(특수문자,문장부호,공백)
# NOTE(review): the Hangul text was restored from a mis-decoded (mojibake)
# copy; confirm the literal against the original file.
st3 = 'test^홍길동 abc 대한*민국 123$tbc'
findall(r'\w{3,}', st3)  # runs of three or more word characters
# ['test', '홍길동', 'abc', '123', 'tbc']

# 5) Excluding characters: [^chars] -> [^t]
findall('[^t]+', st3)  # ['es', '^홍길동 abc 대한*민국 123$', 'bc']
# One or more consecutive characters other than the excluded one.

# Excluding the special characters ^ * $
findall('[^^*$]+', st3)  # ['test', '홍길동 abc 대한', '민국 123', 'tbc']