๋ฐ์ํ
Selenium์ WebDriver๋ฅผ ์ฌ์ฉํด ํฌ๋กค๋งํ๊ธฐ
# 추가 패키지 설치
!pip install supabase # 수파베이스 SDK 설치
!pip install selenium # 헤드리스 브라우저를 위한 테스트 자동화 툴
!pip install beautifulsoup4 # html 파싱 툴
- 크롬 화면 우상단 ... 메뉴 버튼 클릭 → 설정 → 하단 Chrome 정보 클릭
- 크롬 드라이버 다운로드
- 아래 코드로 자신의 운영체제 및 아키텍처 확인
- https://googlechromelabs.github.io/chrome-for-testing/#stable
Chrome for Testing availability
chrome-headless-shellmac-arm64https://storage.googleapis.com/chrome-for-testing-public/124.0.6367.118/mac-arm64/chrome-headless-shell-mac-arm64.zip200
googlechromelabs.github.io
import platform
import sys, os, requests, zipfile
# Detect the operating system and CPU architecture, report them, and pick the
# matching ChromeDriver download from the Chrome for Testing bucket.
os_name = platform.system().lower()
architecture = platform.machine()

if os_name == 'darwin':
    if architecture == 'arm64':
        print("운영체제: macOS, 아키텍처: ARM64")
    elif architecture == 'x86_64':
        print("운영체제: macOS, 아키텍처: x64")
elif os_name == 'windows':
    # sys.maxsize exceeds 2**32 only on a 64-bit Python build.
    if sys.maxsize > 2**32:
        print("운영체제: Windows, 아키텍처: 64-bit")
    else:
        print("운영체제: Windows, 아키텍처: 32-bit")
else:
    print(f"운영체제: {os_name}, 아키텍처: {architecture}")

# Chrome for Testing publishes one archive per platform directory
# (linux64, mac-arm64, mac-x64, win32, win64). Anything that is not macOS or
# Windows keeps the original linux64 download.
chrome_driver_version = '124.0.6367.91'
if os_name == 'darwin':
    chrome_driver_platform = 'mac-arm64' if architecture == 'arm64' else 'mac-x64'
elif os_name == 'windows':
    chrome_driver_platform = 'win64' if sys.maxsize > 2**32 else 'win32'
else:
    chrome_driver_platform = 'linux64'

chrome_driver_url = (
    'https://storage.googleapis.com/chrome-for-testing-public/'
    f'{chrome_driver_version}/{chrome_driver_platform}/'
    f'chromedriver-{chrome_driver_platform}.zip'
)
- 크롬 드라이버 설치 및 작동 확인
# Download the ChromeDriver archive into ./driver.
os.makedirs('./driver', exist_ok=True)
with requests.get(chrome_driver_url, timeout=60) as response:
    # Stop on an HTTP error so an error page is never saved as the zip file.
    response.raise_for_status()
    with open('./driver/chromedriver.zip', 'wb') as file:
        file.write(response.content)

# Extract the archive next to it, then delete the zip once it is closed.
with zipfile.ZipFile('./driver/chromedriver.zip') as zip_ref:
    zip_ref.extractall('./driver')
os.remove('./driver/chromedriver.zip')
from glob import glob

# Locate the extracted driver binary anywhere under ./driver.
# The Windows build ships the binary as chromedriver.exe; macOS and Linux
# builds ship it as chromedriver.
driver_name = 'chromedriver.exe' if os_name == 'windows' else 'chromedriver'
driver_candidates = glob(f'./driver/**/{driver_name}', recursive=True)
if not driver_candidates:
    raise FileNotFoundError(
        f"{driver_name} was not found under ./driver; "
        "check that the download and extraction succeeded"
    )
driver_path = driver_candidates[0]

if os_name != 'windows':
    # zipfile.extractall does not restore Unix permission bits, so the binary
    # has to be marked executable before Selenium can launch it.
    os.chmod(driver_path, 0o755)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
# Launch Chrome through the downloaded ChromeDriver binary and open the page.
service = Service(executable_path=driver_path)

chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',                            # run without a visible window
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--blink-settings=imagesEnabled=false',  # do not load images
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(service=service, options=chrome_options)

# Placeholder: replace with the address of the page to crawl.
url = '크롤링 할 url'
driver.get(url)
์์ ๋ฐฉ์๋๋ก ๋ฌธ์ ์์ด ์งํ๋๋ค๋ฉด ํฌ๋กค๋ง์ด ์ฑ๊ณต์ ์ผ๋ก ์งํ๋๋ค. ํ์ง๋ง ๋์ ๊ฒฝ์ฐ ์ ๋ ๋์ ๋ฒ์ ์ ๋ฌธ์ ์ธ์ง ํฌ๋กฌ ๋ธ๋ผ์ฐ์ ์ ๋ฌธ์ ์ธ์ง WebDriver ๊ฐ์ฒด๋ฅผ ๋ง๋ค ๋, Chrome binary๋ฅผ ์ ๋๋ก ์ฐพ์ง ๋ชปํ๋ ์๋ฌ๊ฐ ๋ฐ์ํ๋ค.
์๋ฌ ๋ด์ฉ
WebDriverException ๋ฐ์ ์ - Seleniumbase
์ฐพ์๋ณด๋ selenium์ด ์ต์ ๋ฒ์ ์ผ๋ก ์ ๋ฐ์ดํธ ๋๋ฉด์, seleniumbase ๋ผ์ด๋ธ๋ฌ๋ฆฌ ๋ด Driver ํด๋์ค๋ฅผ ์ฌ์ฉํด์ผ ์ ์์ ์ผ๋ก ์๋ํ๋ค๊ณ ํ๋ค.
# 패키지 설치
!pip install seleniumbase
from seleniumbase import Driver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
# Open the page with seleniumbase's Driver and parse the rendered HTML.
driver = Driver(browser="chrome", headless=True)
try:
    # Placeholder: replace with the address of the page to crawl.
    url = '크롤링 할 url'
    driver.get(url)
    print(driver)  # confirm the driver object was created

    # Parse the page source.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always release the browser when crawling ends, even if loading or
    # parsing raised. quit() closes every window and ends the session.
    driver.quit()
반응형