• 【Python】python 爬虫学习


    response = requests.get("http://www.baidu.com")

     response.content.decode("utf-8")  # response.content 返回 bytes 类型,decode 解码后得到 str

     response.text    response.encoding = "gbk" # response.text 返回 str 类型;出现乱码时先通过 response.encoding 修改编码

     获取图片

    # coding=utf-8
    import requests
    # Download one image and store it on disk.
    url = "http://wap.jiapai.net.cn/images/1.jpg"
    
    response = requests.get(url)
    # Fail loudly on a 4xx/5xx answer instead of saving an error page as an image.
    response.raise_for_status()
    # The resource is a JPEG, so the file gets the matching extension;
    # "wb" because response.content is raw bytes.
    with open("baidu.jpg","wb") as f:
        f.write(response.content)
                                 

    ---

    # 状态码 

    response.status_code 

    # 响应头

    response.headers

    # 请求头

    response.request.headers

    200
    {'Content-Length': '20851', 'Content-Type': 'image/jpeg', 'Last-Modified': 'Sun, 28 Jul 2019 04:29:48 GMT', 'Accept-Ranges': 'bytes', 'ETag': '"1f3f6d17fd44d51:0"', 'Set-Cookie': 'sdwaf-test-item=1ed57f5405075208510954035156575b5c5754065406040d015701515e520c; path=/; HttpOnly', 'X-Powered-By': 'SDWAF', 'Date': 'Tue, 05 May 2020 01:56:48 GMT'} {'User-Agent': 'python-requests/2.23.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}

    ---

    # 发送带header的请求 

    # coding=utf-8
    import requests
    # Request the image with a browser-like User-Agent, then print the status
    # code, the response headers and the headers that were actually sent.
    url = "http://wap.jiapai.net.cn/images/1.jpg"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
    }

    response = requests.get(url, headers=headers)
    for item in (response.status_code, response.headers, response.request.headers):
        print(item)

    ---

    # 发送带参数的请求 

    # Query-string parameters (placeholder key/value); requests encodes them
    # and appends them to the URL.
    params = {"":""}

    # requests needs an explicit scheme: a bare "www..." URL raises MissingSchema.
    url_temp = "http://www.baidu.com/s?"

    requests.get(url_temp,params=params)

    ---

    # 占位符 建议使用format+ {} 代替 

    input_string = input("")

    url = "http://www.baidu.com/s?wd={}".format(input_string)  # 或者(不推荐): url = "https://www.baidu.com/s?wd=%s" % input_string

    ---

    列表推导式

    # 范围0~9 

    [i for i in range(10)]

    # i对2取余 输出 

    [i%2 for i in range(10)]

    # i对2取余,如果对2取余等于0 则输出 

    [i%2 for i in range(10) if i%2==0]

    ---

    ## 面向对象 

    - 对象

      - 生活中的事物

    - 类

      - 对事物的抽象 在代码中实现class 类型

    - 实例

      - 使用之前对类的实例化之后的结果

    --- 

    # get 请求贴吧 

    # coding=utf-8
    import requests
    
    class TiebaSpider:
        """Fetch the first pages of a Baidu Tieba forum and save each page's HTML to a file."""

        def __init__(self,tieba_name):
            """Store the forum name and prepare the URL template and request headers.

            Args:
                tieba_name: Forum name; used as the ``kw`` query value and as
                    the prefix of the saved file names.
            """
            self.tieba_name = tieba_name
            # The trailing "{}" is filled with the ``pn`` value in get_url_list().
            self.url_temp = "https://tieba.baidu.com/f?kw="+tieba_name+"&ie=utf-8&pn={}"
            self.headers ={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}

        def get_url_list(self):
            """Return the 10 page URLs, with ``pn`` stepping 0, 50, ..., 450."""
            return [self.url_temp.format(i*50) for i in range(10)]

        def parse_url(self, url):
            """Send a GET request to *url* and return the body decoded as UTF-8."""
            response = requests.get(url, headers=self.headers)
            return response.content.decode("utf-8")

        def save_html_str(self, html_str, page_num):
            """Write *html_str* to the file ``<tieba_name>-第<page_num>页`` as UTF-8."""
            file_path = "{}-第{}页".format(self.tieba_name, page_num)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_str)

        def run(self):
            """Build the page URLs, download each page and save it under its 1-based page number."""
            # 1. Build the URL list.
            url_list = self.get_url_list()

            # 2. Send each request and get the response.
            # enumerate supplies the page number directly, avoiding a
            # list.index() search per page (which would also return the
            # wrong number if the list ever held duplicate URLs).
            for page_num, url in enumerate(url_list, start=1):
                html_str = self.parse_url(url)

                # 3. Save the page.
                self.save_html_str(html_str, page_num)
    
    if __name__ == "__main__":
        # Script entry point: crawl the "李毅" forum.
        TiebaSpider("李毅").run()
                                          

    # 保存贴吧内容到本地 

    ---

    # Post 请求  安全 大文本传输 

    data = {"":""}  # dict (placeholder key/value) passed as the POST data

    requests.post("https://www.baidu.com",data = data,headers=headers)

  • 相关阅读:
    一失足千古恨在 WSL 中使用了 md 创建文件夹 (2020-04-26)
    开源中国 ThinkPHP 领奖
    投资投机脑图(2019-12-12)
    什么? 1XIN = 21BTC
    笔记:投机和投资 F4NNIU
    如何设置单个 Git 仓库的代理从而提高更新速度
    FastAdmin 使用 phpmail 出现 spl_autoload_register 错误
    plsql 引用型变量
    oracle 存储函数
    oracle存储过程(带参数的存储过程)
  • 原文地址:https://www.cnblogs.com/oscarli/p/12829574.html
Copyright © 2020-2023  润新知