• python16_day36 [Crawler 1]


    一、requests

      1. GET requests

    # 1. GET request without parameters

    import requests

    ret = requests.get('https://github.com/timeline.json')

    print(ret.url)
    print(ret.text)


    # 2. GET request with parameters

    import requests

    payload = {'key1': 'value1', 'key2': 'value2'}
    ret = requests.get("http://httpbin.org/get", params=payload)

    print(ret.url)
    print(ret.text)

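      Beyond ret.url and ret.text, a Response object exposes a few attributes that come up constantly; a quick sketch against the same httpbin.org test service used above:

    import requests

    ret = requests.get("http://httpbin.org/get", params={'key1': 'value1'})

    print(ret.status_code)              # HTTP status code, e.g. 200
    print(ret.encoding)                 # encoding used to decode ret.text
    print(ret.headers['Content-Type'])  # response headers behave like a dict
    print(ret.json())                   # parse a JSON body directly into a dict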
      2. POST requests

    # 1. Basic POST example

    import requests

    payload = {'key1': 'value1', 'key2': 'value2'}
    ret = requests.post("http://httpbin.org/post", data=payload)

    print(ret.text)


    # 2. Sending request headers along with the data

    import requests
    import json

    url = 'https://api.github.com/some/endpoint'
    payload = {'some': 'data'}
    headers = {'content-type': 'application/json'}

    ret = requests.post(url, data=json.dumps(payload), headers=headers)

    print(ret.text)
    print(ret.cookies)

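      The second POST example serializes the payload by hand with json.dumps; requests (2.4.2 and later) can do the same thing via the json= keyword, which also sets the Content-Type header automatically:

    import requests

    url = 'https://api.github.com/some/endpoint'
    payload = {'some': 'data'}

    # json= serializes the payload and sets Content-Type: application/json for you
    ret = requests.post(url, json=payload)
    print(ret.status_code)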
      3. Other request methods

    requests.get(url, params=None, **kwargs)
    requests.post(url, data=None, json=None, **kwargs)
    requests.put(url, data=None, **kwargs)
    requests.head(url, **kwargs)
    requests.delete(url, **kwargs)
    requests.patch(url, data=None, **kwargs)
    requests.options(url, **kwargs)

    # all of the methods above are built on top of this one
    requests.request(method, url, **kwargs)
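      Because every verb-specific helper simply delegates to requests.request, the two calls below produce the same request; this form is handy when the HTTP method arrives as a variable:

    import requests

    # these two calls build identical requests
    r1 = requests.get("http://httpbin.org/get", params={'k': 'v'})
    r2 = requests.request("GET", "http://httpbin.org/get", params={'k': 'v'})

    print(r1.url == r2.url)  # True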

      4. Autohome news

    import requests
    from bs4 import BeautifulSoup

    # response = requests.get("http://www.autohome.com.cn/news/")
    # # response.text is a str
    # # response.content is raw bytes
    #
    # response.encoding = 'gbk'   # the site is encoded as gbk
    # root = BeautifulSoup(response.text, 'html.parser')   # parse the response body with bs4
    # outer_div_obj = root.find(name='div', id='auto-channel-lazyload-article')  # find the div with this id
    # li_obj_list = outer_div_obj.find_all(name='li')     # get every li inside it
    #
    # for li_obj in li_obj_list:
    #     if not li_obj.find('h3'):
    #         continue
    #     title_obj = li_obj.find('h3')       # the h3 tag object
    #     summary_obj = li_obj.find('p')      # the p tag object
    #     img_obj = li_obj.find('img')        # the img tag object
    #     src = img_obj.attrs.get('src')      # read the src attribute off the img tag
    #
    #     print(src, title_obj.text, summary_obj.text)

    response = requests.get("http://www.autohome.com.cn/news/")
    response.encoding = 'gbk'

    soup = BeautifulSoup(response.text, 'html.parser')
    tag = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    li_list = tag.find_all('li')

    for li in li_list:
        h3 = li.find('h3')

        if not h3:
            continue
        print("\033[33;1mTitle: {0}\033[0m".format(h3.text))
        print("\033[34;1mPath: http://{0}\033[0m".format(li.find('img').attrs['src']))
        print("\033[34;1mSummary: {0}\033[0m".format(li.find('p').text))

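      The commented notes above distinguish response.text (a str) from response.content (bytes); the bytes form is what you want when saving the scraped images to disk. A minimal sketch, with a hypothetical image URL standing in for one scraped from an img tag:

    import requests

    img_url = 'http://img.example.com/news/sample.jpg'  # hypothetical; in practice use a src scraped above
    img = requests.get(img_url)

    with open('news.jpg', 'wb') as f:  # binary mode: write response.content (bytes), not response.text
        f.write(img.content)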
      5. GitHub login

    #!/usr/bin/env python
    # -*-coding:utf8-*-
    # __author__ = "willian"

    import requests
    from bs4 import BeautifulSoup

    # First request: get the csrf token and the initial cookies
    r1 = requests.get('https://github.com/login')
    b1 = BeautifulSoup(r1.text, 'html.parser')
    # get token
    auth_token = b1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
    # get cookies
    r1_cookie_dict = r1.cookies.get_dict()


    # Second request: submit the login credentials
    r2 = requests.post("https://github.com/session",
                       data={
                           'commit': "Sign in",
                           'utf8': '',
                           'authenticity_token': auth_token,
                           'login': '',
                           'password': ""
                       }, cookies=r1_cookie_dict)
    # get cookies
    r2_cookie_dict = r2.cookies.get_dict()

    # merge the cookies from both responses
    all_cookie_dict = {}
    all_cookie_dict.update(r1_cookie_dict)
    all_cookie_dict.update(r2_cookie_dict)


    # Third request: fetch a page that is only visible after a successful login
    r3 = requests.get('https://github.com/settings/emails', cookies=all_cookie_dict)
    print(r3.text)

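      Merging cookie dicts by hand works, but requests also provides requests.Session, which carries cookies across requests automatically; a sketch of the same three-step GitHub flow (credentials elided, as in the original):

    import requests
    from bs4 import BeautifulSoup

    session = requests.Session()  # cookies persist across requests automatically

    r1 = session.get('https://github.com/login')
    token = BeautifulSoup(r1.text, 'html.parser').find(
        name='input', attrs={'name': 'authenticity_token'}).get('value')

    session.post('https://github.com/session', data={
        'commit': 'Sign in',
        'utf8': '',
        'authenticity_token': token,
        'login': '',       # username elided
        'password': '',
    })

    r3 = session.get('https://github.com/settings/emails')
    print(r3.text)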
      6. Chouti upvote

    #!/usr/bin/env python
    # -*-coding:utf8-*-
    # __author__ = "willian"

    import requests
    from bs4 import BeautifulSoup

    # 1. Initial request, just to collect cookies
    r0 = requests.get("http://dig.chouti.com")
    r0_cookie_dict = r0.cookies.get_dict()


    # 2. Log in (authorization)
    r1 = requests.post(
        url="http://dig.chouti.com/login",
        data={
            'phone': 'xx',
            'password': 'xx',
            'oneMonth': 1
        },
        cookies=r0_cookie_dict
    )
    r1_cookie_dict = r1.cookies.get_dict()

    all_cookies = {}
    all_cookies.update(r0_cookie_dict)
    all_cookies.update(r1_cookie_dict)

    # 3. Upvote a post
    r2 = requests.post(url='http://dig.chouti.com/link/vote?linksId=14808951', cookies=all_cookies)
    print(r2.text)

    二、BeautifulSoup4
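      The Autohome and GitHub examples above lean on a small set of BeautifulSoup4 calls; as a self-contained recap (the HTML string here is made up for illustration):

    from bs4 import BeautifulSoup

    html = "<div id='content'><h3>Title</h3><p class='summary'>Text</p></div>"
    soup = BeautifulSoup(html, 'html.parser')

    div = soup.find(name='div', attrs={'id': 'content'})  # first matching tag
    tags = div.find_all('p')                              # list of all matching tags

    print(div.find('h3').text)         # .text extracts the inner text
    print(tags[0].attrs.get('class'))  # .attrs is a dict of the tag's attributes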

    三、wechat
