반응형
<출처> https://github.com/netscout/chocopy/tree/master/How-to/web_crawling
파이썬, 셀레니움 크롤링 예제들
예제 1
# Example 1: static scraping with requests + BeautifulSoup.
# Fetches a DataCamp track page and collects (link, title, description)
# for every course card found by the CSS selector.
import requests as req
import bs4

url = "https://www.datacamp.com/tracks/machine-learning-scientist-with-python"

res = req.get(url)
res.raise_for_status()  # fail loudly instead of parsing an HTTP error page
bs = bs4.BeautifulSoup(res.text, features="html.parser")

# Selector is tied to the site's generated class names (css-93pq91 etc.)
# and will break when the site is redeployed — expected for a scraper demo.
courses = bs.select("#gatsby-focus-wrapper > div > div.container.css-93pq91 > div.col-md-8 > div > div > div > div.css-2cldv8 > a")

courseList = []
for c in courses:
    link = c.attrs["href"]
    title = c.select_one("h4").getText().strip()
    desc = c.select_one("p").getText().strip()
    courseList.append({"link": link, "title": title, "desc": desc})

print(courseList)
예제 2
# Example 2: scraping a JS-rendered page with Selenium.
# Clicks the "show more" button so all course cards are in the DOM,
# then parses driver.page_source with BeautifulSoup.
from selenium import webdriver
from selenium.webdriver.common.by import By
import bs4

url = "https://www.datacamp.com/tracks/machine-learning-scientist-with-python"

driver = webdriver.Chrome()
driver.implicitly_wait(3)
driver.get(url)

try:
    # find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, ...)
    btn = driver.find_element(By.XPATH, """//*[@id="gatsby-focus-wrapper"]/div/div[1]/div[1]/div/div/div[4]/button""")
    btn.click()

    bs = bs4.BeautifulSoup(driver.page_source, features="html.parser")
    courses = bs.select("#gatsby-focus-wrapper > div > div.container.css-93pq91 > div.col-md-8 > div > div > div > div.css-2cldv8 > a")

    courseList = []
    for c in courses:
        link = c.attrs["href"]
        title = c.select_one("h4").getText().strip()
        desc = c.select_one("p").getText().strip()
        courseList.append({"link": link, "title": title, "desc": desc})

    print(len(courseList))
finally:
    driver.quit()  # always release the browser process
예제 3
# Example 3: two-level crawl — collect course cards from the track page,
# then visit each course page and gather its chapter titles, descriptions
# and per-chapter exercise titles into courseList.
from selenium import webdriver
from selenium.webdriver.common.by import By
import bs4

base_url = "https://www.datacamp.com"
url = f"{base_url}/tracks/machine-learning-scientist-with-python"

driver = webdriver.Chrome()
driver.implicitly_wait(3)
driver.get(url)

try:
    # find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, ...)
    btn = driver.find_element(By.XPATH, """//*[@id="gatsby-focus-wrapper"]/div/div[1]/div[1]/div/div/div[4]/button""")
    btn.click()

    bs = bs4.BeautifulSoup(driver.page_source, features="html.parser")
    courses = bs.select("#gatsby-focus-wrapper > div > div.container.css-93pq91 > div.col-md-8 > div > div > div > div.css-2cldv8 > a")

    courseList = []
    for c in courses:
        link = c.attrs["href"]
        title = c.select_one("h4").getText().strip()
        desc = c.select_one("p").getText().strip()
        courseList.append({"link": link, "title": title, "desc": desc})

    # Second pass: open each course detail page (links are site-relative).
    for c in courseList:
        driver.get(f"{base_url}{c['link']}")
        bs_detail = bs4.BeautifulSoup(driver.page_source, features="html.parser")
        chapters = bs_detail.select_one("ol.chapters")
        chapters_elem = chapters.select("li.chapter")

        chapter_list = []
        for chap in chapters_elem:
            chap_title = chap.select_one("h4.chapter__title").getText().strip()
            chap_desc = chap.select_one("p.chapter__description").getText().strip()
            chap_details_elem = chap.select("h5.chapter__exercise-title")
            chap_detail_titles = []
            for cd in chap_details_elem:
                cd_title = cd.getText().strip()
                chap_detail_titles.append(cd_title)
            chapter_detail = {"title": chap_title, "desc": chap_desc, "details": chap_detail_titles}
            chapter_list.append(chapter_detail)

        c["chapter_detail"] = chapter_list

    print(courseList)
    print(len(courseList))
finally:
    driver.quit()  # always release the browser process
예제 4
# Example 4: same two-level crawl as example 3, then write the result to an
# Excel workbook with openpyxl. Layout: course in column 1/2, chapters in
# column 2/3, exercise titles in column 3, one row per item.
from selenium import webdriver
from selenium.webdriver.common.by import By
import bs4
import openpyxl as xl

base_url = "https://www.datacamp.com"
url = f"{base_url}/tracks/machine-learning-scientist-with-python"

driver = webdriver.Chrome()
driver.implicitly_wait(3)
driver.get(url)

try:
    # find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, ...)
    btn = driver.find_element(By.XPATH, """//*[@id="gatsby-focus-wrapper"]/div/div[1]/div[1]/div/div/div[4]/button""")
    btn.click()

    bs = bs4.BeautifulSoup(driver.page_source, features="html.parser")
    courses = bs.select("#gatsby-focus-wrapper > div > div.container.css-93pq91 > div.col-md-8 > div > div > div > div.css-2cldv8 > a")

    courseList = []
    for c in courses:
        link = c.attrs["href"]
        title = c.select_one("h4").getText().strip()
        desc = c.select_one("p").getText().strip()
        courseList.append({"link": link, "title": title, "desc": desc})

    # Second pass: open each course detail page (links are site-relative).
    for c in courseList:
        driver.get(f"{base_url}{c['link']}")
        bs_detail = bs4.BeautifulSoup(driver.page_source, features="html.parser")
        chapters = bs_detail.select_one("ol.chapters")
        chapters_elem = chapters.select("li.chapter")

        chapter_list = []
        for chap in chapters_elem:
            chap_title = chap.select_one("h4.chapter__title").getText().strip()
            chap_desc = chap.select_one("p.chapter__description").getText().strip()
            chap_details_elem = chap.select("h5.chapter__exercise-title")
            chap_detail_titles = []
            for cd in chap_details_elem:
                cd_title = cd.getText().strip()
                chap_detail_titles.append(cd_title)
            chapter_detail = {"title": chap_title, "desc": chap_desc, "details": chap_detail_titles}
            print(chapter_detail)
            chapter_list.append(chapter_detail)

        c["chapter_detail"] = chapter_list
finally:
    driver.quit()  # always release the browser process

# Dump the collected hierarchy into a worksheet.
wb = xl.Workbook()
sheet = wb.active
sheet.title = "Machine Learning Scientist with Python"

row = 2
col = 1
for c in courseList:
    sheet.cell(row=row, column=col).value = c["title"]
    sheet.cell(row=row, column=col + 1).value = c["desc"]
    row += 1
    for cd in c["chapter_detail"]:
        sheet.cell(row=row, column=col + 1).value = cd["title"]
        sheet.cell(row=row, column=col + 2).value = cd["desc"]
        row += 1
        for cd_title in cd["details"]:
            sheet.cell(row=row, column=col + 2).value = cd_title
            row += 1

wb.save("Machine Learning Scientist with Python.xlsx")
예제 5
# Example 5: explicit wait — block until the "show more" button is clickable
# instead of relying only on the implicit wait.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import bs4

url = "https://www.datacamp.com/tracks/machine-learning-scientist-with-python"

driver = webdriver.Chrome()
driver.implicitly_wait(3)
driver.get(url)

try:
    wait = WebDriverWait(driver, 10)
    # wait.until(...) already returns the located element once the condition
    # holds, so no second lookup is needed (find_element_by_xpath was also
    # removed in Selenium 4).
    btn = wait.until(EC.element_to_be_clickable((By.XPATH, """//*[@id="gatsby-focus-wrapper"]/div/div[1]/div[1]/div/div/div[4]/button""")))
    btn.click()

    bs = bs4.BeautifulSoup(driver.page_source, features="html.parser")
    courses = bs.select("#gatsby-focus-wrapper > div > div.container.css-93pq91 > div.col-md-8 > div > div > div > div.css-2cldv8 > a")

    courseList = []
    for c in courses:
        link = c.attrs["href"]
        title = c.select_one("h4").getText().strip()
        desc = c.select_one("p").getText().strip()
        courseList.append({"link": link, "title": title, "desc": desc})

    print(len(courseList))
finally:
    driver.quit()  # always release the browser process
반응형
'웹 크롤링, 스크래핑' 카테고리의 다른 글
파이썬 셀레니움, 열린 경고창 닫기 (0) | 2022.07.31 |
---|---|
파이썬 셀레니움, 같은 웹브라우저에서 다른 탭으로 url 열기 (0) | 2022.07.30 |
파이썬, 크롤링 스레드 예제 (BeautifulSoup, ThreadPoolExecutor) (0) | 2022.07.30 |
비동기로 여러 사이트 접속하여 HTML 가져와서 파일 저장하기 (0) | 2022.07.30 |
웹 사이트 HTML 코드를 다운로드하여 html 파일로 저장하기 (1) | 2022.07.30 |