from pathlib import Path

import requests
class TiebaSpider:
    """Simple crawler that downloads Baidu Tieba forum pages to local HTML files."""

    def __init__(self, keywords):
        # Forum (tieba) name to crawl.
        self.kw = keywords
        # Base search URL; forum name and page offset are appended as query params.
        self.url = "https://tieba.baidu.com/f?ie=utf-8"
        # Browser-like User-Agent so the server treats us as a normal client.
        # (The original placeholder value "请求头" would likely get blocked.)
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            )
        }

    def get_data(self, start_page, end_page):
        """
        Fetch forum pages and save each one as an HTML file under data/.

        :param start_page: first page to fetch (1-based, inclusive)
        :param end_page: last page to fetch (inclusive)
        :return: None; each page is written to data/tieba_<page>.html
        """
        for page in range(start_page, end_page + 1):
            # Tieba paginates 50 threads per page via the "pn" byte offset.
            params = {"kw": self.kw, "pn": (page - 1) * 50}
            response = requests.get(self.url, params=params, headers=self.headers)
            self._save_data(f"tieba_{page}.html", response.content)

    def _save_data(self, file_name, content):
        """Write raw response bytes to data/<file_name>, creating data/ if missing."""
        target_dir = Path("data")
        # The original code crashed with FileNotFoundError when data/ did not exist.
        target_dir.mkdir(parents=True, exist_ok=True)
        (target_dir / file_name).write_bytes(content)
if __name__ == "__main__":
    # Crawl the first two result pages of the forum and dump them to disk.
    spider = TiebaSpider("王者荣耀")
    spider.get_data(1, 2)