Most of a crawler's running time is spent blocked on IO while waiting for web requests to return. With multiple threads, the waits for different requests proceed concurrently instead of one after another, which improves the crawler's throughput.
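As a minimal sketch of the effect, ten requests issued from ten threads finish in roughly the time of the slowest single request, because each thread blocks on its own response while the others keep waiting in parallel (the target URL here is arbitrary, used only for illustration):

```python
import time
from threading import Thread

import requests

# An arbitrary endpoint, used only to illustrate overlapping IO waits.
urls = ['https://api.github.com/' for _ in range(10)]

def fetch(url):
    requests.get(url)  # the thread blocks here on network IO

start = time.time()
threads = [Thread(target=fetch, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Total elapsed time is close to one request's latency, not the sum of ten.
print('elapsed:', time.time() - start, 's')
```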
The script below uses multithreading (10 threads are started here) together with the GitHub API to crawl information about the forks of the cpython project, then stores the data in a JSON file.
```python
import requests
import time
from threading import Thread
from queue import Queue, Empty
import json


def run_time(func):
    """Decorator that reports how long the wrapped function takes."""
    def wrapper(*args, **kw):
        start = time.time()
        func(*args, **kw)
        end = time.time()
        print('running', end - start, 's')
    return wrapper


class Spider():

    def __init__(self):
        self.qurl = Queue()   # thread-safe queue of page URLs to crawl
        self.data = list()    # results shared by all worker threads
        self.email = 'xxx'    # placeholder GitHub credentials
        self.password = 'xxx'
        self.page_num = 120
        self.thread_num = 10

    def produce_url(self):
        # Enqueue every page URL before the worker threads start.
        baseurl = 'https://api.github.com/repos/python/cpython/forks?page={}'
        for i in range(1, self.page_num + 1):
            url = baseurl.format(i)
            self.qurl.put(url)

    def get_info(self):
        # Each worker drains the queue until it is empty. get_nowait()
        # avoids the race where another thread empties the queue between
        # an empty() check and a blocking get(), which would hang forever.
        while True:
            try:
                url = self.qurl.get_nowait()
            except Empty:
                break
            print('crawling', url)
            req = requests.get(url, auth=(self.email, self.password))
            data = req.json()
            for datai in data:
                result = {
                    'project_name': datai['full_name'],
                    'project_url': datai['html_url'],
                    'project_api_url': datai['url'],
                    'star_count': datai['stargazers_count']
                }
                self.data.append(result)  # list.append is atomic in CPython

    @run_time
    def run(self):
        self.produce_url()

        # Start the worker threads, then wait for all of them to finish.
        ths = []
        for _ in range(self.thread_num):
            th = Thread(target=self.get_info)
            th.start()
            ths.append(th)
        for th in ths:
            th.join()

        s = json.dumps(self.data, ensure_ascii=False, indent=4)
        with open('github_thread.json', 'w', encoding='utf-8') as f:
            f.write(s)

        print('Data crawling is finished.')


if __name__ == '__main__':
    Spider().run()
```
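One caveat: GitHub has since removed username/password basic authentication from its REST API, so the `auth=(self.email, self.password)` call above is rejected by current GitHub. A personal access token works in its place, and authenticating also raises the API rate limit, which matters when 120 pages are fetched in quick succession. A minimal sketch, with the token value as a placeholder:

```python
import requests

TOKEN = 'xxx'  # placeholder: a GitHub personal access token

resp = requests.get(
    'https://api.github.com/repos/python/cpython/forks?page=1',
    headers={'Authorization': 'token {}'.format(TOKEN)},
)
print(resp.status_code, len(resp.json()))
```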
Reference:
https://juejin.im/post/5b0951ab51882538ac1ce3c8