• 简单爬虫


    import requests
    # requests模块,发出请求,接受响应,包括请求响应

    # Present a desktop-browser User-Agent instead of the requests default.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/60.0.3112.113 Safari/537.36"
        ),
    }

    # Query-string arguments; requests encodes them onto the URL (?wd=python).
    params = {"wd": "python"}

    search_url = "http://www.baidu.com/s"
    r = requests.get(search_url, params=params, headers=headers)

    # r.content holds the raw response body as bytes; decode() with no
    # argument decodes it as UTF-8.
    page_text = r.content.decode()
    print(page_text)

    # Show the URL that was actually requested, query string included.
    requested_url = r.request.url
    print(requested_url)


    import requests

            # "发送请求"
    # Fetch the Baidu home page using the default requests headers.
    home_url = "http://www.baidu.com"
    r = requests.get(home_url)

    # r.text decodes the body with the encoding that requests guesses for the
    # response, so the printed page may contain garbled characters.
    # Per the original author's note, the page returned here is also fairly
    # short: without a browser-like User-Agent the site presumably treats the
    # client as a crawler and withholds some content.
    body_text = r.text
    print(body_text)

    # Alternatives for getting correctly decoded text:
    #   * inspect the guessed encoding:       print(r.encoding)
    #   * decode the raw bytes yourself:      r.content.decode()
    #   * override the encoding, then r.text: r.encoding = "utf8"; print(r.text)

    # The request headers that were sent can be inspected with:
    #   print(r.request.headers)

    # Print the response headers returned by the server.
    response_headers = r.headers
    print(response_headers)


     

    import requests

     

    class TiebaSpider:
        """Download listing pages of one Baidu Tieba forum and save them as HTML.

        Each page is fetched from ``https://tieba.baidu.com/f`` with the forum
        name in the ``kw`` query argument and a ``pn`` offset that advances in
        steps of 50, then written to ``<tieba_name>_<page_num>.html`` in the
        current working directory.
        """

        def __init__(self, tieba_name, page_count=5, timeout=10):
            """Store the forum name and prepare the URL template and headers.

            Args:
                tieba_name: Forum name; used in the URL and in output file names.
                page_count: Number of listing pages to fetch (default 5, the
                    value that used to be hard-coded).
                timeout: Seconds to wait for each HTTP request before requests
                    raises a timeout error instead of blocking indefinitely.
            """
            self.tieba_name = tieba_name
            self.page_count = page_count
            self.timeout = timeout

            # "{}" is filled in later with the pn offset of each page.
            self.temp_url = "https://tieba.baidu.com/f?kw=" + tieba_name + "&pn={}"

            # NOTE(review): the run of spaces before "Chrome" looks like a paste
            # artifact; it is kept verbatim so the header sent is unchanged.
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko)           Chrome/60.0.3112.113 Safari/537.36"
            }

        def get_url_list(self):
            """Build the list of page URLs (pn = 0, 50, 100, ...)."""
            return [self.temp_url.format(i * 50) for i in range(self.page_count)]

        def parse_url(self, url):
            """Send a GET request to ``url`` and return the body decoded as UTF-8."""
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            return response.content.decode()

        def save_html(self, html, page_num):
            """Write ``html`` to ``<tieba_name>_<page_num>.html`` as UTF-8 text."""
            file_path = self.tieba_name + "_" + str(page_num) + ".html"

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html)

        def run(self):
            """Fetch every page in order and save each one to its own file."""
            url_list = self.get_url_list()

            # enumerate gives the 1-based page number directly; looking it up
            # with list.index() would rescan the list on every iteration.
            for page_num, url in enumerate(url_list, start=1):
                html_str = self.parse_url(url)
                self.save_html(html_str, page_num)

     


    if __name__ == "__main__":
        # Script entry point: download the pages of the "蒋欣" forum and save
        # them as HTML files in the current working directory.
        tieba = TiebaSpider("蒋欣")
        tieba.run()

    Life is short, I need Python.
  • 相关阅读:
    Error creating bean with name 'configurationPropertiesBeans'异常
    bootstrap.yml没有小绿叶,添加配置无提示,配置无法加载
    Hello from Docker
    windows jdk8 nacos-server-1.4.0本地搭建启动
    Myeclipse导入的maven项目相关依赖包无法下载
    Idea的Terminal窗口找不见
    Win10系统Win键无法使用
    git拉代码报"error: RPC failed; curl 56 OpenSSL SSL_read: SSL_ERROR_SYSCALL, errno 10054"
    团队作业6:复审与事后分析
    团队作业4:项目冲刺
  • 原文地址:https://www.cnblogs.com/lvhonglei-python/p/7525559.html
Copyright © 2020-2023  润新知