• 9. Request & 爬虫


    from urllib import request
    
    def get_request(url='http://www.baidu.com'):
        """Fetch *url* with urllib and print the response.

        Prints the status line, every response header as ``name:value`` and
        finally the body decoded as UTF-8. Nothing is returned.

        Args:
            url: Address to request. Defaults to the Baidu home page.
        """
        with request.urlopen(url) as f:
            # Read the body while the connection is still open.
            data = f.read()
            print('Status:', f.status, f.reason)
            for k, v in f.getheaders():
                print('%s:%s' % (k, v))

        print('Data:', data.decode('utf-8'))
    
    def main():
        """Run the urllib demo request."""
        # get_request() prints its own output and returns None, so wrapping
        # the call in print() would only add a stray "None" line.
        get_request()

    if __name__ == "__main__":
        main()
    

    利用requests完成get请求:

    import requests
    
    # Perform a GET request with requests
    url = "http://www.baidu.com"  # target URL
    # Send the request once and keep the response object.
    result = requests.get(url)
    print(result.status_code)  # HTTP status code of the response
    
    D:\Python\python.exe D:/Work/Tools/python_workspace/python_2017/class_basic/temp3.py
    200
    

    输出状态码200,请求成功。

    利用requests完成post请求:

    import requests
    
    # Perform a GET and a POST request against the same URL with requests
    url = "http://www.baidu.com"  # target URL
    result = requests.get(url)     # response of the GET request
    result_2 = requests.post(url)  # response of the POST request
    print(result.status_code)
    print(result_2.status_code)
    
    D:\Python\python.exe D:/Work/Tools/python_workspace/python_2017/class_basic/temp3.py
    200
    302
    

    输出状态码302。

    将返回结果打印出来:

    # Print the response body as text
    print(result.text)
    

    即可将返回结果打印出来(一整个页面)。

    如果页面返回的是json格式的数据,还可以将返回的数据解析为json:

    # Parse the response body as JSON; raises JSONDecodeError when the body is not JSON
    print(result.json())
    

    因此处不支持json格式,报错:

    D:\Python\python.exe D:/Work/Tools/python_workspace/python_2017/class_basic/temp3.py
    Traceback (most recent call last):
      File "D:/Work/Tools/python_workspace/python_2017/class_basic/temp3.py", line 10, in <module>
        print(result.json())
      File "D:\Python\lib\site-packages\requests\models.py", line 897, in json
        return complexjson.loads(self.text, **kwargs)
      File "D:\Python\lib\json\__init__.py", line 348, in loads
        return _default_decoder.decode(s)
      File "D:\Python\lib\json\decoder.py", line 337, in decode
        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
      File "D:\Python\lib\json\decoder.py", line 355, in raw_decode
        raise JSONDecodeError("Expecting value", s, err.value) from None
    json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
    

    所以要确定支持json格式才可使用。

    前程无忧网址示例:

    import requests
    
    # Perform a GET request with requests
    url="https://search.51job.com/list/040000,000000,0000,00,9,99,%25E8%25BD%25AF%25E4%25BB%25B6%25E6%25B5%258B%25E8%25AF%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="  # prepare the URL first
    requests.get(url)  # NOTE(review): this response is discarded; the next line repeats the same GET
    result=requests.get(url)    # response object of the GET request (carries the status code)
    result_2=requests.post(url)  # NOTE(review): result_2 is never used below
    #print(result.status_code)
    print(result.text)  # print the response body as text
    
    D:\Python\python.exe D:/Work/Tools/python_workspace/python_2017/class_requests/requests.py
    Traceback (most recent call last):
      File "D:/Work/Tools/python_workspace/python_2017/class_requests/requests.py", line 2, in <module>
        import requests
      File "D:\Work\Tools\python_workspace\python_2017\class_requests\requests.py", line 6, in <module>
        requests.get(url)
    AttributeError: module 'requests' has no attribute 'get'
    

    检查后发现,是因为python文件名命名为“requests.py”,与requests库同名,导致import requests时导入的是脚本自身而不是requests库;rename后再次执行:

    D:\Python\python.exe D:/Work/Tools/python_workspace/python_2017/class_requests/class1_requests.py
    <!DOCTYPE html>
    <html>
    <head>
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <meta http-equiv="Content-Type" content="text/html; charset=gbk">
    <link rel="icon" href="/favicon.ico" type="image/x-icon"/>
    ...
    ...
    

    有时我们必须添加头部信息:请求头headers才能请求成功,方法如下:

    import requests
    
    # Perform a GET request with requests, sending explicit request headers
    # Target URL
    url = "https://sou.zhaopin.com/?jl=765&sf=8001&st=10000&kw=%E8%BD%AF%E4%BB%B6%E6%B5%8B%E8%AF%95&kt=3"

    # Request headers to send along with the request
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://i.zhaopin.com/",
        "Host": "sou.zhaopin.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0",
        "Connection": "keep-alive",
    }

    # Pass the dict by keyword: the second positional argument of
    # requests.get() is `params`, which would append these values to the
    # query string instead of sending them as HTTP headers.
    result = requests.get(url, headers=headers)
    print(result.text)
    

    headers信息在浏览器页面使用F12-网络-所有-消息头查看。

  • 相关阅读:
    Android系统Recovery工作原理2update.zip差分包问题的解决
    学习 原理图1 认识 元器件
    ARM新GPU架构Midgard
    ARM新GPU架构Midgard
    10种图片防盗链的方法
    一个基于PDO的数据库操作类(新) + 一个PDO事务实例
    localhost与127.0.0.1的区别
    header ContentType类型
    PHP采集利器:Snoopy 试用心得
    一个简单易用的导出Excel类
  • 原文地址:https://www.cnblogs.com/xiaotufei/p/13338451.html
Copyright © 2020-2023  润新知