• Python爬虫_百度贴吧


    # This crawler fetches Baidu Tieba listing pages and stores each page's HTML.

    import requests


    class TiebaSpider:
        """Download the listing pages of one Baidu Tieba forum as HTML files.

        Args:
            tieba_name: Forum name, inserted into the ``kw`` query parameter
                and used as the prefix of every saved file name.
            max_pages: Number of listing pages to request. Defaults to 1000,
                the value that used to be hard-coded.
            timeout: Seconds to wait for the server before ``requests`` gives
                up on a request. Without a timeout a stalled connection would
                block the crawl forever.
        """

        # The ``pn`` query value grows by this amount from one page to the next.
        PAGE_STEP = 50

        def __init__(self, tieba_name, max_pages=1000, timeout=10):
            self.tieba_name = tieba_name
            self.max_pages = max_pages
            self.timeout = timeout
            # The trailing "{}" is filled with the ``pn`` offset per page.
            self.url = "https://tieba.baidu.com/f?kw="+tieba_name+"&ie=utf-8&pn={}"
            self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}

        def get_url_list(self):
            """Return the listing-page URLs, one per page, in page order."""
            return [
                self.url.format(page_index * self.PAGE_STEP)
                for page_index in range(self.max_pages)
            ]

        def parse_url(self, url):
            """Send a GET request to *url* and return the decoded response body.

            Raises:
                requests.exceptions.RequestException: On connection failure or
                    when the server does not answer within ``self.timeout``.
                UnicodeDecodeError: If the response body is not valid UTF-8.
            """
            res = requests.get(url, headers=self.headers, timeout=self.timeout)
            return res.content.decode()

        def save_html(self, html_str, page_num):
            """Write *html_str* to ``<tieba_name>-第<page_num>页.html`` in the CWD."""
            file_path = "{}-第{}页.html".format(self.tieba_name, page_num)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_str)

        def run(self):
            """Fetch every listing page and save it, printing each URL when done."""
            # enumerate() yields the 1-based page number directly, instead of
            # searching the list for the current URL on every iteration.
            for page_num, url in enumerate(self.get_url_list(), start=1):
                html_str = self.parse_url(url)
                self.save_html(html_str, page_num)
                print(url)


    if __name__ == "__main__":
        tieba_spider = TiebaSpider("lol")
        tieba_spider.run()
  • 相关阅读:
    预处理器&预处理变量&头文件保护&条件编译
    Xctf攻防世界—crypto—Normal_RSA
    RSA共模攻击
    centos7安装宝塔面板
    cobalt strike出现连接超时情况解决办法
    C语言变量
    Hello World!
    ctfshow—web—web7
    ctfshow—web—web6
    ctfshow—web—web5
  • 原文地址:https://www.cnblogs.com/waterr/p/13893578.html
Copyright © 2020-2023  润新知