发送请求
'''
1.发送请求:
- pip3 install requests
'''
import requests
from urllib.parse import urlencode
# Step 1: the target URL for a GET request. The "wd" query value is already
# percent-encoded inside the literal.
url = 'https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3'

# Request headers sent along with the GET request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
headers = {'User-Agent': user_agent, 'Accept': accept}

# Step 2: send the GET request with the requests module. The call returns a
# Response object, which is bound to `response`.
response = requests.get(url=url, headers=headers)

# Show the HTTP status code (200 means success), then the response text.
print(response.status_code)
page_text = response.text
print(page_text)

# Save the same response text to disk as UTF-8.
with open('baidu.html', mode='w', encoding='utf-8') as html_file:
    html_file.write(page_text)
demo2
import requests
from urllib.parse import urlencode
# Step 1: build the GET URL. urlencode() turns the query dict into an
# encoded query string, which is appended to the search endpoint.
base_url = 'https://www.baidu.com/s?'
query_string = urlencode({'wd': '美女'})
url = base_url + query_string

# Request headers sent along with the GET request.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
])

# Step 2: send the GET request with the requests module and keep the
# returned Response object in `response`.
response = requests.get(url, headers=headers)

# Report the status code first (200 means success), then the response text.
status = response.status_code
print(status)
body = response.text
print(body)

# Write the response text to a UTF-8 file.
with open('meinv.html', 'w', encoding='utf-8') as out_file:
    out_file.write(body)
demo3
import requests
# Step 1: the search endpoint. The query itself is supplied separately
# through the `params` argument of requests.get().
url = 'https://www.baidu.com/s?'
query_params = {'wd': '美女'}

# Request headers sent along with the GET request.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'

# Step 2: send the GET request with the requests module; the Response
# object that comes back is bound to `response`.
response = requests.get(url, params=query_params, headers=headers)

# Status code first (200 means success), then the response text.
print(response.status_code)
result_html = response.text
print(result_html)

# Store the response text in a UTF-8 encoded file.
with open('girl.html', 'w', encoding='utf-8') as girl_file:
    girl_file.write(result_html)
通过get请求,绕过github登录
'''
1、携带github登录过后的cookies信息,访问github邮箱设置页(settings/emails),绕过登录;
用户名:aaa
邮箱:bbb
密码:ccc
1.请求url:
- https://github.com/settings/emails
2.请求方式:
- GET
3.请求头:
- cookies:
Cookie: _octo=GH1.1.816288935.1571284371; has_recent_activity=1; tz=Asia%2FShanghai; _device_id=a00ea80a1caf3404bd890d1a520083a6; user_session=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; __Host-user_session_same_site=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; logged_in=yes; dotcom_user=Michaeljy; _gh_sess=Vy82WmxrcFZmREZvV0dnTUtIaENDVWhPY3M1dEpwRVNWelVmYjk3TWQzQVpVVS84S3hhOXgrMkhRR2lHZUhZNUwvQzU1U3RHUWxWNmhITDFVV3FFcTUreFdPaWsyT2hTZlllQ2Fld3dFVmR5Q0w2RDdKbEduZENidUxySkd6MXVOKzdXQURsRVpSZDVUUjY4TE83c0JCK3BrcWlmeFVBZzhxVGFiVDRPd01HUlI0M3FrVFNLZ3dyNlJ6K0pQaWdQZHFYUFdISFRHT1ZZc3k4UDNFaVB4eWhLY3NTZnRaK0JTb3JlZzhGd3o1eFBaeHh1eCtReFZ3dWFsQXBqSWdNc0ltODdVTTk2bjVrdytFekNyWGlEZnFSdGZLSTZ6eW1jY1pvUGJYWWRIU0RtK01BWC9yOFRqb2FPNENJRUZra3F2V250dVZWMys3ajl5ZHVXQThUVGRQSHhmSDNwUXdnTUFUb0grMytlT1Y5U1RqVEYzak4zV1RIRURVbktycXR5dG1NZVJYNWZLdGxVR1hZeFFmemV0dFhiZXhiZHRKTGw0RHZvYmh6NEJmdHRPVkdFSENzaHhRSjJzVWlya2R4N3N3S3FWVmxpVkl0cWs0bmUvNmhlMGpwU3hNdTFZKzI5eCt1c0RjRVViVjY4Z2E4YzFtbkRhVmtybUZabGg4VVVlaUtLV3FEUUVMdjRvUnU1amdpbnJUTStOV1NIbGpTRGNRc25pbjVlazZ5MzV0b2tVek1qaW1HUDhMWVU1bzhjN3BsYkQ1MXVkaWg2STROdDRCNWVxMUtCTUt2L0o1Y01aTWsxM1BQNUJSZUxVcnBWMXRYdStlTlA2ZFhiVERYRVcwYUZMeFlmNjNxUTN1a0dNSVQySlZQTnpiTTY2UDk2WWMycG9iQzB4SWlRNnFhM3RVRXFxUHF4ZHBmd2R6Rk0wcFQ3b0VvNkJCd1NQZGhGNThKaHdzcEtpSXZMUUxRWVVlOHUyVFFRQmZBdURBZnNXRk9JRTljQVMzWmExSXprWFI2RVMrYkNMaVJybGR0cC9vN09rWTVvYXdHRWxaMmQwdmYzenM5dDVLMXVpTEhkWnNXVkNyWTRVd3JiSWFUVkhDak9Qb0xUekxjTk5PWWxtb2d6V1hZSGNOMkhkdWlVNXY1TTExREc0RW9mUGlVK1FoYVNYLzRSM21OMGFabkZyWHBEOFBpNXBRSU9rYkRXYVczVi8yc3ZPYURJZGUzY1AwN3l0S0szci8yc1J6WmhkQ3ZFeVJuT1pkR2FkMys3d0xTa21QNE80M3hVby82TmJ4MG1DWFZ6ZDlDR3RMaUJBNXhSVHhnMmRwMWxNSTRTR2hRM1JXdncyRklEWlA3cTU2cGRpU0wyVUljZmtGVVNXN2ltTG54cW52QzloWitSZjRVdUh3NnNrcEVWdTFoM1JXNnFqVW55SXRDTmY5WXEvakQ0bk9lWlpFRTMwbm9qWkp0QXlWZkE2MzlQNXJUbk51WkttTFE4QVB3Y1pzZDc0Zlp0ei81U1FKS3JnUVV6SUZoMkhSZXBsL0M0VXBLTUNKd2YvQ2JJTmpVK2JtZ1J6VkE4MW9lVTlZRVVlOHRHd1dNUDJLYkFSbTFGbExuYmFtOHkyc0p6OTdOYnp4ZWJlTjdxR0V2eWZRYXlqeEhmTC92QlJGa2lmaVU9
LS1HQmU4cXZCL0trdVVTMW5QVXkzWUdnPT0%3D--e417927d7291752b7527f64c960dbe38450f2543
- User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
- Referer: https://github.com/settings/profile
'''
import requests
import os

url = 'https://github.com/settings/emails'

# The Cookie header of an already logged-in browser session is what lets this
# request skip the login form. It is a live credential, so it is read from the
# GITHUB_COOKIE environment variable instead of being pasted into the source.
github_cookie = os.environ.get('GITHUB_COOKIE', '')
if not github_cookie:
    raise SystemExit(
        'Set GITHUB_COOKIE to the Cookie request header copied from a '
        'logged-in github.com browser session.'
    )

github_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Cookie': github_cookie,
}

# 1. Send the GET request; the cookie travels in the `headers` keyword
#    argument.
response = requests.get(
    url,
    headers=github_headers,
)
print(response.status_code)  # 200 expected

# 'bbb' stands for the account's email address; True means that address
# appears in the returned page.
print('bbb' in response.text)

with open('emails.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
通过post请求永久绕过github登录
'''
用户名:aaa
邮箱:bbb
密码:ccc
1.先分析 http 的请求流程
- 请求url:
Request URL: https://github.com/session
- 请求方式:
Request Method: POST
- 请求头:
- Referer: https://github.com/login
- User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
- 请求体: (form data):
commit: Sign in
utf8: ✓
authenticity_token: finQLn5TxHAabDmefQ2EbBXV27jDGlWm6DUu+u5u4J6QnPXBmkc76/QlDpx61v1NFf3AP8r+vg1Cq31G9Wxenw==
ga_id:
login: aaa
password: ccc
webauthn-support: supported
webauthn-iuvpaa-support: supported
required_field_a359:
timestamp: 1577696492100 # 时间戳
timestamp_secret: 03e50e82485174cadc2dda90916b93bfeadef0ac92643cbfde40e6c7f598bbb6
- 1) 先往https://github.com/login页面发送get请求,获取authenticity_token与timestamp_secret随机加密字符串
- 2) 携带加密字符串与请求体所有的信息,一并通过post请求访问https://github.com/session
2.再写爬虫代码
'''
import requests
import re
# 1) GET https://github.com/login first, to obtain the authenticity_token and
#    timestamp_secret strings embedded in the login form.
url = 'https://github.com/login'
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}


def _extract_hidden_input(pattern, html, field_name):
    """Return the first capture group of *pattern* found in *html*.

    Args:
        pattern: regular expression with one capture group for the value.
        html: text of the login page.
        field_name: form field name, used only in the error message.

    Raises:
        RuntimeError: if *pattern* does not match anywhere in *html*.
    """
    matches = re.findall(pattern, html, re.S)
    if not matches:
        raise RuntimeError(
            'Could not find hidden input %r in the login page; '
            'the page markup may have changed.' % field_name
        )
    return matches[0]


# One Session is used for the whole flow. Its cookie jar collects the cookies
# of every response, including intermediate redirect responses that requests
# follows by default. `Response.cookies` on the final response would only hold
# the cookies set by that last response.
with requests.Session() as session:
    # Send the same User-Agent on every request, including the POST.
    session.headers.update(login_headers)

    login_response = session.get(url)

    # 2) Parse authenticity_token and timestamp_secret out of the login page
    #    with the re module.
    authenticity_token = _extract_hidden_input(
        '<input type="hidden" name="authenticity_token" value="(.*?)" />',
        login_response.text,
        'authenticity_token',
    )
    timestamp_secret = _extract_hidden_input(
        '<input type="hidden" name="timestamp_secret" value="(.*?)" class="form-control" />',
        login_response.text,
        'timestamp_secret',
    )

    # 3) POST the scraped strings together with the rest of the form body to
    #    https://github.com/session.
    form_data = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': authenticity_token,
        'ga_id': '',
        'login': 'aaa',
        'password': 'ccc',
        'webauthn-support': 'supported',
        'webauthn-iuvpaa-support': 'supported',
        'required_field_a359': '',
        # NOTE(review): this timestamp is hard-coded while timestamp_secret is
        # scraped fresh. Presumably both must come from the same login page;
        # confirm, and scrape the timestamp as well if so.
        'timestamp': 1577696892274,
        'timestamp_secret': timestamp_secret
    }
    session_url = 'https://github.com/session'
    session_response = session.post(session_url, data=form_data)
    print(session_response.status_code)
    # The session's jar is what gets sent with the following requests.
    print(session.cookies)

    # 4) After logging in, any page can be requested through the same session.
    #    Without a Session, cookies can be carried in two ways:
    #    - headers: put them in a 'Cookie' header
    #    - cookies: pass a cookie jar / dict through the `cookies` argument

    # 5) Verify the login by checking that the account's email address ('bbb')
    #    appears on the email settings page.
    emails_response = session.get('https://github.com/settings/emails')
    print('bbb' in emails_response.text)
爬取梨视频
#爬取视频
#https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=48&mrd=0.9993282952193101&filterIds=1625835,1625642,1625837,1625841,1625870,1625869,1625813,1625844,1625801,1625856,1625857,1625847,1625838,1625827,1625787
#https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0
#获取视频
import re
# Fetch the first page of the category listing (start=0).
res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')

# Each listing entry links to a video detail page; capture the href values.
reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'
obj = re.findall(reg_text, res.text)
print(obj)

for url in obj:
    # The captured hrefs are relative, so prefix the site root.
    url = 'https://www.pearvideo.com/' + url
    res1 = requests.get(url)

    # The detail page is expected to carry the media address as srcUrl="...".
    obj1 = re.findall('srcUrl="(.*?)"', res1.text)
    if not obj1:
        # Skip pages without a srcUrl instead of failing with IndexError.
        print('no srcUrl found in', url)
        continue

    video_url = obj1[0]
    print(video_url)

    # Use the last path segment of the media URL as the local file name.
    name = video_url.rsplit('/', 1)[1]
    print(name)

    # stream=True avoids loading the whole video into memory, and an explicit
    # chunk size avoids iter_content()'s default of one byte per iteration.
    with requests.get(video_url, stream=True) as res2:
        with open(name, 'wb') as f:
            for chunk in res2.iter_content(chunk_size=64 * 1024):
                f.write(chunk)