Source: https://github.com/netscout/chocopy/blob/master/How-to/web_crawling/crawling01.py
Python: threaded crawling example (BeautifulSoup, ThreadPoolExecutor)
# 1
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
from concurrent.futures import ThreadPoolExecutor

def get_links():
    curr_list = 'https://en.wikipedia.org/wiki/List_of_circulating_currencies'
    all_links = []
    response = requests.get(curr_list)
    soup = BeautifulSoup(response.text, "lxml")
    # Select the links in the first two columns of the currency tables
    # (state/territory name and currency name)
    curr_el = soup.select('p+table td:nth-child(2) > a, p+table td:nth-child(1) > a:nth-child(1)')
    for link_el in curr_el:
        link = link_el.get("href")
        link = urljoin(curr_list, link)  # resolve relative hrefs to absolute URLs
        all_links.append(link)
    return all_links

def fetch(link):
    response = requests.get(link)
    with open("./output/" + link.split("/")[-1] + ".html", "wb") as f:
        f.write(response.content)
    print('.', end='', flush=True)

if __name__ == '__main__':
    os.makedirs("./output", exist_ok=True)  # fetch() fails if the directory is missing
    links = get_links()
    print(f"Total pages: {len(links)}")
    start_time = time.time()

    # Unoptimized (sequential):
    # for link in links:
    #     fetch(link)

    # Threading
    with ThreadPoolExecutor(max_workers=100) as executor:
        executor.map(fetch, links)

    duration = time.time() - start_time
    print(f"Downloaded {len(links)} links in {duration} seconds")
    # max_workers=8   -> ~16 seconds
    # max_workers=100 -> ~1.75 seconds
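One caveat with executor.map: exceptions raised inside fetch() only surface when the results are iterated, and here they never are, so a failed download passes silently. A minimal sketch of a more defensive variant using submit() and as_completed(); this is my own addition, not part of the source script:

# Sketch: per-link error reporting with submit()/as_completed()
# (variant of the threading example above, not from the source repo)
from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(max_workers=100) as executor:
    # Map each Future back to the link it is downloading
    futures = {executor.submit(fetch, link): link for link in links}
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception from fetch()
        except Exception as e:
            print(f"\nFailed: {futures[future]} ({e})")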
# 2
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
from multiprocessing import Pool, cpu_count

def get_links():
    curr_list = 'https://en.wikipedia.org/wiki/List_of_circulating_currencies'
    all_links = []
    response = requests.get(curr_list)
    soup = BeautifulSoup(response.text, "lxml")
    curr_el = soup.select('p+table td:nth-child(2) > a, p+table td:nth-child(1) > a:nth-child(1)')
    for link_el in curr_el:
        link = link_el.get("href")
        link = urljoin(curr_list, link)
        all_links.append(link)
    return all_links

def fetch(link):
    response = requests.get(link)
    with open("./output/" + link.split("/")[-1] + ".html", "wb") as f:
        f.write(response.content)
    print('.', end='', flush=True)

if __name__ == '__main__':
    os.makedirs("./output", exist_ok=True)
    links = get_links()
    print(f"Total pages: {len(links)}")
    start_time = time.time()

    # Multiprocessing: one worker process per CPU core
    with Pool(cpu_count()) as p:
        p.map(fetch, links)

    duration = time.time() - start_time
    print(f"Downloaded {len(links)} links in {duration} seconds")
    # ~18.11 seconds
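The timings show why threads win here: the work is I/O-bound (waiting on HTTP responses), so 100 threads finish in about 1.75 seconds, while a cpu_count()-sized process pool takes about 18 seconds because it runs far fewer downloads at once and pays process startup and IPC overhead. The same downloads can also run concurrently on a single thread with asyncio; a minimal sketch, assuming the third-party aiohttp package is installed (pip install aiohttp), again not part of the source script:

# Sketch: async variant of fetch() using asyncio + aiohttp
# (assumes aiohttp is installed; not from the source repo)
import asyncio
import aiohttp

async def fetch_async(session, link):
    async with session.get(link) as response:
        content = await response.read()
    with open("./output/" + link.split("/")[-1] + ".html", "wb") as f:
        f.write(content)
    print('.', end='', flush=True)

async def main(links):
    # One shared session; all downloads run concurrently on one thread
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch_async(session, link) for link in links))

# asyncio.run(main(get_links()))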