Crawler workflow
- Determine the URL of the crawl target
- Send the request from Python code and fetch the data
- Parse the fetched data (extract exactly the data you need)
- Find new targets (new URLs) and go back to step one -- this is what makes the crawl automatic (see the sketch right after this list)
- Persist the data
- Python 3 (built-in module): urllib.request
    - urlopen
        - returns a response object
        - response.read()
        - bytes.decode('utf-8')
    - GET: passing parameters
        - Chinese characters raise an error: ASCII has no Chinese characters, so the URL must be percent-encoded with urllib.parse.quote(url, safe=string.printable)
        - dict parameters: urllib.parse.urlencode()
    - POST
        - urllib.request.urlopen(url, data=b"the data the server expects") (a minimal sketch follows the GET examples below)
    - custom handler
    - User-Agent
        - mimic a real browser when sending requests (lists of User-Agent strings are easy to find online)
        - request.add_header(key, value) -- add header fields dynamically
        - response headers: response.headers
        - create a request object: request = urllib.request.Request(url)
    - proxy IP:
        - handler -- opener -- opener.open(url)
    - URLError
- Python 2 (built-in): urllib2
    - request
- Data parsing: XPath, bs4
- Data storage: JSON, CSV, MongoDB, MySQL (a small bs4 + CSV sketch follows this list)
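A minimal end-to-end sketch tying the workflow steps above together (fetch, parse, persist, follow new URLs). The start URL, the link regex, and the output file names are illustrative assumptions rather than part of the original notes:

import re
import urllib.request

def crawl(start_url, max_pages=3):
    to_visit = [start_url]          # 1. the target URLs
    seen = set()
    while to_visit and len(seen) < max_pages:
        url = to_visit.pop()
        if url in seen:
            continue
        seen.add(url)
        html = urllib.request.urlopen(url).read().decode('utf-8')  # 2. send the request, fetch the data
        new_urls = re.findall(r'href="(http[^"]+)"', html)         # 3. parse the data / 4. find new targets
        to_visit.extend(new_urls)                                   #    and loop back to step one
        with open("page_%d.html" % len(seen), "w", encoding="utf-8") as f:
            f.write(html)                                           # 5. persist the data

# crawl("http://www.baidu.com/")  # assumed start URL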
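The last two bullets name bs4 for parsing and CSV (among others) for storage, but neither appears in the code below, so here is a small hedged sketch. It assumes beautifulsoup4 is installed and uses Baidu's homepage only as a stand-in target:

import csv
import urllib.request
from bs4 import BeautifulSoup  # pip install beautifulsoup4

html = urllib.request.urlopen("http://www.baidu.com/").read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
## collect the text and href of every link on the page
rows = [(a.get_text(strip=True), a.get('href')) for a in soup.find_all('a')]
## persist the rows to a CSV file
with open("links.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["text", "href"])
    writer.writerows(rows)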
Code
1、urllib.request
01-url_open_code.py
## request without parameters
import urllib.request

def load_data():
    # url = "https://www.baidu.com/"  ## the https version returns trimmed content (for security); http returns the full page
    url = "http://www.baidu.com/"
    ## GET request over HTTP
    ## response: the HTTP response object
    response = urllib.request.urlopen(url)
    print(response)
    ## read the body -- bytes
    data = response.read()
    # print(data)
    ## convert the fetched bytes into a string
    str_data = data.decode('utf-8')
    # print(str_data)
    ## write the data to a file
    with open("baidu.html", "w", encoding="utf-8") as f:
        f.write(str_data)
    ## convert a string to bytes
    str_name = "baidu"
    bytes_name = str_name.encode('utf-8')
    print(bytes_name)
    ## a crawl returns either str or bytes
    ## if the result is bytes but you need str, call decode('utf-8')
    ## if the result is str but you need bytes, call encode('utf-8')

load_data()
02-get_paras.py
import urllib.request
import urllib.parse  ## module used to percent-encode Chinese characters
import string

## pass a single parameter
def get_method_paras():
    url = "http://www.baidu.com/s?wd="
    ## concatenate the (Chinese) keyword onto the URL
    name = "美女"
    final_url = url + name
    print(final_url)
    ## the URL actually sent cannot contain Chinese characters: https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
    ## ASCII has no Chinese characters, so the URL has to be percent-encoded first
    encode_new_url = urllib.parse.quote(final_url, safe=string.printable)
    print(encode_new_url)  ## the Chinese "美女" is now percent-encoded
    ## send the request
    ## response = urllib.request.urlopen(final_url)  ## contains Chinese characters -> raises an error; use encode_new_url instead
    response = urllib.request.urlopen(encode_new_url)
    print(response)  ## the result is a response object
    data = response.read().decode()  ## read the body; decode() defaults to utf-8
    print(data)
    with open("02-encode.html", "w", encoding='utf-8') as f:
        f.write(data)

get_method_paras()
02-get_params2.py
import urllib.request
import urllib.parse
import string

## pass several parameters, assembled from a dict
def get_params():
    url = "http://www.baidu.com/s?"
    ## with several parameters, build a dict and let urlencode turn it into key=value pairs
    params = {
        'wd': '中文',
        'key': 'zhang',
        'value': 'san'
    }
    ## turn the dict entries into key=value pairs joined by &
    str_params = urllib.parse.urlencode(params)
    print(str_params)  ## result: wd=%E4%B8%AD%E6%96%87&key=zhang&value=san
    final_url = url + str_params  ## concatenate the URL
    new_url = urllib.parse.quote(final_url, safe=string.printable)  ## percent-encode any remaining Chinese characters
    print(new_url)
    response = urllib.request.urlopen(new_url)  ## request the page, returns a response object
    print(response)
    data = response.read().decode('utf-8')  ## read the response
    print(data)

get_params()
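The outline lists POST requests via urlopen(url, data=...), but the urllib files in these notes only send GET requests (the login example in 08-cookies_3 posts through an opener). A minimal sketch of a plain POST; the endpoint httpbin.org/post is an assumed echo test service, not part of the original notes:

import urllib.parse
import urllib.request

def post_example():
    url = "http://httpbin.org/post"  ## assumed echo endpoint, not from the original notes
    form = {'key': 'value', 'wd': '中文'}
    ## POST data must be bytes: dict -> urlencode -> str -> encode to bytes
    data = urllib.parse.urlencode(form).encode('utf-8')
    ## passing data= makes urlopen send a POST instead of a GET
    response = urllib.request.urlopen(url, data=data)
    print(response.read().decode('utf-8'))

# post_example()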
import urllib.request

## create a request object only, without adding any header fields
def load_baidu():
    url = "http://www.baidu.com"
    ## build a request object from the url
    request = urllib.request.Request(url)
    ## request the data
    ## response = urllib.request.urlopen(url)
    response = urllib.request.urlopen(request)
    print(response)
    data = response.read().decode('utf-8')
    ## response headers
    print(response.headers)
    ## request headers
    req_headers = request.headers
    print(req_headers)  ## empty, since nothing was added
    with open("03-headers.html", "w", encoding='utf-8') as f:
        f.write(data)

load_baidu()
import urllib.request

## create a request and add header fields, here with fixed values
def load_baidu():
    url = "http://www.baidu.com"
    header = {
        ## browser version
        'haha': 'hehe',  ## meaningless field, only used to test reading the request headers back
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    ## build the request object and pass the headers directly
    # request = urllib.request.Request(url, headers=header)
    ## or build the request object first and add header fields dynamically
    request = urllib.request.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36')
    ## full URL of the request
    full_url = request.get_full_url()
    print(full_url)
    ## request the data
    ## response = urllib.request.urlopen(url)
    ## headers cannot be added at this point, because urlopen has no headers parameter
    response = urllib.request.urlopen(request)
    print(response)
    data = response.read().decode('utf-8')
    ## response headers
    print(response.headers)
    ## request headers
    request_headers = request.headers  ## all request headers, as a dict
    print(request_headers)
    request_headers_User_agent = request.get_header("User-agent")  ## a single header field
    print(request_headers_User_agent)
    # with open("03-headers.html", "w", encoding='utf-8') as f:
    #     f.write(data)

load_baidu()
04-random_user_agent.py
import urllib.request
import random

def load_baidu():
    url = "https://www.baidu.com"
    ## pool of User-Agent strings
    user_agent_list = [
        ## Windows 7 browsers
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50'
    ]
    ## pick a different browser identity for every request
    random_user_agent = random.choice(user_agent_list)
    ## build the request object
    request = urllib.request.Request(url)
    ## add the header dynamically
    request.add_header('User-Agent', random_user_agent)
    ## read the header back
    print(request.get_header('User-agent'))  ## note the key: first letter upper-case, the rest must be lower-case
    ## request the data
    response = urllib.request.urlopen(request)
    print(response)

load_baidu()
05-handler_openner.py
import urllib.request

## purpose: the groundwork for adding a proxy
## the built-in urlopen has no way to add a proxy, so we have to build that ourselves
## ssl: secure sockets layer, backed by a third-party CA certificate
## http uses port 80, https uses port 443
## urlopen can fetch data because it is built on handlers
## here we build our own opener and use it to request the data
def handler_openner():
    url = "https://www.cnblogs.com/moox/tag"
    # urllib.request.urlopen()
    ## create our own handler
    handler = urllib.request.HTTPHandler()
    ## create our own opener from it
    opener = urllib.request.build_opener(handler)
    ## use the opener's open method to request the data
    response = opener.open(url)
    data = response.read()
    print(data)

handler_openner()
06-proxy_handler.py
import urllib.request

### 06-proxy_handler
def create_proxy_handler():
    url = "https://www.baidu.com"
    ## add a free proxy
    proxy = {
        ## free-proxy style, e.g. from the Xici free proxy list
        # "http": "http://120.77.249.46:8080"
        "http": "120.77.249.46:8080"  ## shorthand
    }
    ## proxy handler
    proxy_handler = urllib.request.ProxyHandler(proxy)
    ## create our own opener from it
    opener = urllib.request.build_opener(proxy_handler)
    data = opener.open(url).read()
    print(data)

create_proxy_handler()
06-random_user_proxy
import urllib.request

def create_proxy():
    url = "https://www.cnblogs.com/moox/tag"
    # try several free proxies in turn  06-random_user_proxy
    proxy_list = [
        {"http": "1.0.9.41:8080"},
        {"http": "120.77.249.42:8080"},
        {"http": "120.77.249.43:8080"}
    ]
    for proxy in proxy_list:
        print(proxy)
        proxy_handler = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_handler)
        try:
            opener.open(url, timeout=0.1)
            print("haha")
        except Exception as e:
            print(e)

create_proxy()
07-money_proxy_handler_1
import urllib.request

## using a paid proxy  07-money_proxy_handler_1
# 1. send the request with a username and password
def money_proxy_use():
    url = "https://www.baidu.com"
    ## first approach: credentials embedded in the proxy URL
    print("first approach")
    # 1. proxy ip
    money_proxy = {"http": "username:pwd@192.168.12.11:8080"}
    # 2. proxy handler
    proxy_handler = urllib.request.ProxyHandler(money_proxy)
    # 3. build an opener from the handler
    opener = urllib.request.build_opener(proxy_handler)
    # 4. open: send the request
    try:
        response = opener.open(url)
        print(response)
    except Exception as e:
        print(e)

    ## second approach: a password manager plus an auth handler
    print("second approach")
    ## 1. credentials
    username = "abcname"
    pwd = "123456"
    proxy_money = "123.123.123.123:8888"
    ## 2. create a password manager and register the username and password
    password_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_manager.add_password(None, proxy_money, username, pwd)
    ## 3. create a handler that can authenticate against the proxy
    handler_auth_proxy = urllib.request.ProxyBasicAuthHandler(password_manager)
    ## 4. build an opener from the handler
    opener_auth = urllib.request.build_opener(handler_auth_proxy)
    ## 5. send the request
    response = opener_auth.open(url)
    print(response.read())

## crawling your own company's data for analysis is covered in 07-auth_use.py
money_proxy_use()
07-auth_use_nei_wang2
import urllib.request

## request inside an intranet, e.g. crawling your own company's data  07-auth_use_nei_wang2
def auth_nei_wang():
    # 1. username, password and internal URL
    user = "admin"
    pwd = "admin123"
    nei_url = "http://172.168.179.66"
    # 2. create a password manager
    pwd_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pwd_manager.add_password(None, nei_url, user, pwd)
    # 3. create the auth handler (in practice requests is used more often for this)
    auth_handler = urllib.request.HTTPBasicAuthHandler(pwd_manager)
    # 4. build an opener from the handler
    opener = urllib.request.build_opener(auth_handler)
    # 5. send the request
    response = opener.open(nei_url)
    print(response)

auth_nei_wang()
08-cookies_1
import urllib.request

# 08-cookies_1
## without cookies, the data fetched after "logging in" is still the logged-out page
# 1. data url
url = "https://www.yaozh.com/member/"  ## the personal-centre page shown after login
# 2. request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
# 3. build the request object
request = urllib.request.Request(url, headers=headers)
# 4. send the request
response = urllib.request.urlopen(request)
# 5. read the data
data = response.read()
print(type(data))
## note: data is bytes here, but writing in text mode needs str
## option 1: data = response.read().decode("utf-8")  -- watch out for gbk pages
## option 2: write the bytes directly, i.e. open the file with "wb"
# 6. save to a file to check the result
with open("08-cookies.html", "wb") as f:
    f.write(data)
08-cookies_2
import urllib.request

## 08-cookies_2
'''
Fetch the personal-centre page directly after logging in:
copy the Cookie value captured in the browser (PC)
and put it into the request headers of the request object.
'''
# 1. data url
url = "https://www.yaozh.com/member/"  ## the personal-centre page shown after login
# 2. request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Cookie': 'acw_tc=2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574; PHPSESSID=qp7vu5k86b80o99nk3ne2nqeo6; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1595128086; _ga=GA1.2.1668375763.1595128086; _gid=GA1.2.1688163507.1595128086; yaozh_logintime=1595128155; yaozh_user=956263%09moox2020; yaozh_userId=956263; yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyXnoaabJtnm5eHnKZxanJT1qeSoMZYoNdzb5tan9LU2pOUlpFZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmJ6eCB926858ECd33fF24d2161B6ecf9232XkpackmyaV6DXn5VtWamhnsZbbKabZ5ieW2iWcWeUl5qSmZuYaJ1XoOE%3D6e6fa20636976fac57f639153c479218; _gat=1; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1595128158; yaozh_uidhas=1; yaozh_mylogin=1595128161; acw_tc=2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574'
}
# 3. build the request object
request = urllib.request.Request(url, headers=headers)
# 4. send the request
response = urllib.request.urlopen(request)
# 5. read the data
data = response.read()
print(type(data))
## note: data is bytes here, but writing in text mode needs str
## option 1: data = response.read().decode("utf-8")  -- watch out for gbk pages
## option 2: write the bytes directly, i.e. open the file with "wb"
# 6. save to a file to check the result
with open("08-cookies.html", "wb") as f:
    f.write(data)
08-cookies_3
import urllib.request
from http import cookiejar  ## cookiejar stores cookies automatically
from urllib import parse

## 08-cookies_3
'''
Simulate the login in code, then fetch the personal-centre page directly:
1. log in from code; once the login succeeds we hold a (valid) cookie
2. that cookie is then carried along automatically when requesting the personal centre
3. when capturing the login request in the browser, tick "Preserve log",
   otherwise the login request disappears from the capture after the page reloads
'''
def login_cookie():
    # 1. log in from code
    # 1.1 the login URL
    login_url = "https://www.yaozh.com/login"
    # 1.2 the login parameters
    ## note: the parameters should be located on the page *before* logging in,
    ## but it helps to look at what the browser actually posts on login.
    ## the login page itself is also https://www.yaozh.com/login: the URL is the same
    ## before and after, and the backend decides by request method --
    ## a GET returns the login page, a POST performs the login
    """
    parameters posted on login:
    username: moox2020
    pwd:
    formhash: 609FDC2169
    backurl: https%3A%2F%2Fwww.yaozh.com%2F
    """
    # formhash and backurl can be found in the Elements panel of the login page
    login_form_data = {
        "username": "moox2020",
        "pwd": "",
        "formhash": "4D19C2552E",
        "backurl": "https%3A%2F%2Fwww.yaozh.com%2F"
    }
    ## note: 1. dict parameters must go through parse.urlencode;
    ##       2. POST data must be bytes; 3. Chinese characters must be encoded
    login_str = parse.urlencode(login_form_data)
    ## otherwise: TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str.
    login_bytes = login_str.encode('utf-8')
    # 1.3 add a cookiejar and send the login POST
    ## the cookiejar keeps the cookies for us
    cook_jar = cookiejar.CookieJar()
    ## a handler that attaches and stores cookies
    cook_handler = urllib.request.HTTPCookieProcessor(cook_jar)
    ## build an opener from the handler
    opener = urllib.request.build_opener(cook_handler)
    ## request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    ## build the request
    login_request = urllib.request.Request(login_url, headers=headers, data=login_bytes)
    ## send the POST with the parameters;
    ## if the login succeeds, the cookiejar saves the cookie automatically
    opener.open(login_request)  ## the opener now carries the cookie; only success matters, not the response body
    # 2. visit the personal centre with the cookie
    ## doing this by hand would mean copying the cookie out of the login response
    ## and pasting it into a new request, which is tedious -- cookiejar handles it for us
    center_url = "https://www.yaozh.com/member/"
    center_request = urllib.request.Request(center_url, headers=headers)
    response = opener.open(center_request)
    ## the response body is bytes --> str
    ## data = response.read().decode('utf-8')  ## the page encoding can be seen in its <head>
    data = response.read()
    with open("08-cookies_login.html", "wb") as f:  ## writing with "w" fails on bytes, so write "wb" directly
        f.write(data)

login_cookie()
09-http_error
## urllib has two error types:
# HTTPError and URLError; HTTPError is a subclass of URLError
# 09-http_error
import urllib.request
import urllib.error

'''
url = "http://www.zhongsaannghh.com.cn"
response = urllib.request.urlopen(url)
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 11004] getaddrinfo failed>
'''
url = "https://mbd.baidu.cn/newspage"
try:
    response = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:  ## HTTPError and URLError are defined in urllib.error
    print(e.code)
except urllib.error.URLError as e:
    print(e)
2、requests
10.1-requests_content_text
# 1. install the requests module: pip install requests
# basic usage of requests: content, text
## 10.1-requests_content_text
import requests

url = "http://www.baidu.com"
response = requests.get(url)
## the content attribute returns bytes; decode("...") it when a str is needed
# data = response.content
data = response.content.decode('utf-8')  ## note: content is an attribute, not content(), and not read()
print(type(data))
## the text attribute returns str, but it guesses the encoding and may come out garbled; prefer content
data = response.text
print(type(data))
## requests with request headers
## 10.2-requests_headers_cookie
import requests

class RequestSpider(object):
    def __init__(self):
        url = "http://www.baidu.com"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }
        self.response = requests.get(url, headers=headers)

    def run(self):
        self.data = self.response.content
        print(type(self.data))
        ## 1. the request headers that were sent
        request_headers = self.response.request.headers
        print(request_headers)
        ## 2. the response headers
        response_headers = self.response.headers
        print(response_headers)
        ## 3. the response status code
        code = self.response.status_code
        print(code)
        ## 4. the cookies sent with the request
        request_cookie = self.response.request._cookies
        print(request_cookie)
        ## 5. the cookies set by the response
        response_cookie = self.response.cookies
        print(response_cookie)

# instantiate first, then call:
# b = RequestSpider()
# b.run()
# or call directly on a fresh instance:
RequestSpider().run()
10.3-requests_params
## requests percent-encodes the URL automatically
## 10.3-requests_params
import requests

# url = "https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3"
## 1. a Chinese parameter such as "美女" is encoded automatically
url = "https://www.baidu.com/s?wd=美女"
## 2. dict parameters passed via params= are encoded automatically as well
url_base = "https://www.baidu.com/s"
url_params = {
    "wd": "美女"
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
# response = requests.get(url, headers=headers)
response = requests.get(url_base, headers=headers, params=url_params)
data = response.content
with open("baidu_params-10.html", "wb") as f:
    f.write(data)
10.4-requests_json
## converting a JSON response into a dict or a list
## for sending: requests.post(url, data=<form dict>, json=<json body>) -- a small sketch follows 11.1 below
## 10.4-requests_json
'''
https://api.github.com/user returns not HTML but plain JSON:
{
  "message": "Requires authentication",
  "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
}
'''
import requests
import json

url = "https://api.github.com/user"
headers = {
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    # 'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50'
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
}
response = requests.get(url, headers=headers)
# 1. via content: pull the "message" field out by hand
# ## str
# data = response.content.decode("utf-8")
# print(data)  ## the result is a JSON-formatted string; to read just "message" it has to become a dict first
# ## str --> dict
# data_dict = json.loads(data)
# print(data_dict['message'])
# 2. via json(): the string is converted into a Python dict/list automatically
data = response.json()
print(type(data))  ## <class 'dict'>
print(data)
print(data['message'])
11.1-requests-auth
import requests
## 11.1-requests-auth
## sending a POST request (skeleton -- fill in the target url and form data)
url = ""
data = {
}
response = requests.post(url, data=data)
## for intranet pages that require basic auth:
# auth = (user, pwd)  ## a tuple
# response = requests.get(url, auth=auth)
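The note at the top of 10.4 mentions that requests.post accepts both data= (form fields) and json= (a JSON body). Since the skeleton above leaves url and data empty, here is a minimal sketch of the difference; httpbin.org/post is an assumed echo test service, not part of the original notes:

import requests

form_resp = requests.post("http://httpbin.org/post", data={"wd": "中文"})  ## sent as a form body (application/x-www-form-urlencoded)
json_resp = requests.post("http://httpbin.org/post", json={"wd": "中文"})  ## sent as a JSON body (application/json)
print(form_resp.json()["form"])  ## {'wd': '中文'}
print(json_resp.json()["json"])  ## {'wd': '中文'}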
11.2-requests-proxy
import requests
## 11.2-requests-proxy
## sending a GET request through a proxy
url = "https://www.baidu.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
free_proxy = {'http': '27.17.45.90:43411'}
response = requests.get(url, headers=headers, proxies=free_proxy)
print(response.status_code)  ## 200 means the request went through
11.3-requests_ssl
## sites whose SSL certificate fails verification
## 11.3-requests_ssl
import requests

url = "https://www.12306.cn/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
## if a certificate error is raised:
## https normally relies on a third-party CA certificate, but 12306 used to issue its own,
## so tell requests to skip certificate verification, i.e. verify=False
# response = requests.get(url, headers=headers)  ## nowadays the site actually works without disabling verification
response = requests.get(url, headers=headers, verify=False)
data = response.content
# print(data)
with open("11.3-ssl.html", "wb") as f:
    f.write(data)
## note: if all you get back is JavaScript, double-check that the URL is right
11.4-requests_cookies
import requests
## 11.4-requests_cookies
## paste the Cookie copied from the browser after logging in
url = "https://www.yaozh.com/member/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
cookies = 'PHPSESSID=qp7vu5k86b80o99nk3ne2nqeo6; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1595128086; _ga=GA1.2.1668375763.1595128086; _gid=GA1.2.1688163507.1595128086; yaozh_userId=956263; _gat=1; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1595128158; yaozh_uidhas=1; yaozh_mylogin=1595128161; acw_tc=2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574; UtzD_f52b_saltkey=gtc7UEo6; UtzD_f52b_lastvisit=1595126252; _ga=GA1.1.402802462.1595129856; _gid=GA1.1.1215858104.1595129856; UtzD_f52b_ulastactivity=1595128155%7C0; UtzD_f52b_creditnotice=0D0D2D0D0D0D0D0D0D799970; UtzD_f52b_creditbase=0D0D0D0D0D0D0D0D0; UtzD_f52b_creditrule=%E6%AF%8F%E5%A4%A9%E7%99%BB%E5%BD%95; yaozh_user=956263%09moox2020; db_w_auth=799970%09moox2020; yaozh_logintime=1595143780; yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyXnoaabJtnm5eHnKZxanJT1qeSoMZYoNdzb5tan9LU2pOUlpFZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmJ6824f776949bb1CF89325c86aF17C1CB7XkpiWmWiYV6DXn5VtWamhnsZbbKabZ5ieW2iWcWeUmZWYnJWabZlXoOE%3D268bbfec91229863de4864edb7fed7c2; UtzD_f52b_lastact=1595143781%09uc.php%09; UtzD_f52b_auth=e555PLVOXByCsyZ5dlANKt5j1jodJkCYtvA%2B8h7Gd0svI4J%2FQA9SPzcUIlFOd8l2cZdPn7W2nKBuF7N5Zfe9e2MbhSQ'
## the cookies argument of requests.get needs a dict or a cookiejar;
## the Cookie copied above is a string, so it has to be converted to a dict first
# cookies_dict = {
# 'PHPSESSID':'qp7vu5k86b80o99nk3ne2nqeo6',
# 'Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94':'1595128086',
# '_ga':'GA1.2.''.1595128086',
# '_gid':'GA1.2.1688163507.1595128086',
# 'yaozh_userId':'956263',
# '_gat':'1',
# 'Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94':'1595128158',
# 'yaozh_uidhas':'1',
# 'yaozh_mylogin':'1595128161',
# 'acw_tc':'2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574',
# 'UtzD_f52b_saltkey':'gtc7UEo6',
# 'UtzD_f52b_lastvisit':'1595126252',
# '_ga':'GA1.1.402802462.1595129856',
# '_gid':'GA1.1.1215858104.1595129856',
# 'UtzD_f52b_ulastactivity':'1595128155%7C0',
# 'UtzD_f52b_creditnotice':'0D0D2D0D0D0D0D0D0D799970',
# 'UtzD_f52b_creditbase':'0D0D0D0D0D0D0D0D0',
# 'UtzD_f52b_creditrule':'%E6%AF%8F%E5%A4%A9%E7%99%BB%E5%BD%95',
# 'yaozh_user':'956263%09moox2020',
# 'db_w_auth':'799970%09moox2020',
# 'yaozh_logintime':'1595143780',
# 'yaozh_jobstatus':'kptta67UcJieW6zKnFSe2JyXnoaabJtnm5eHnKZxanJT1qeSoMZYoNdzb5tan9LU2pOUlpFZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmJ6824f776949bb1CF89325c86aF17C1CB7XkpiWmWiYV6DXn5VtWamhnsZbbKabZ5ieW2iWcWeUmZWYnJWabZlXoOE%3D268bbfec91229863de4864edb7fed7c2',
# 'UtzD_f52b_lastact':'1595143781%09uc.php%09',
# 'UtzD_f52b_auth':'e555PLVOXByCsyZ5dlANKt5j1jodJkCYtvA%2B8h7Gd0svI4J%2FQA9SPzcUIlFOd8l2cZdPn7W2nKBuF7N5Zfe9e2MbhSQ'
# }
## building cookies_dict by hand as above is far too tedious; split the Cookie string into a dict instead
## method 1: a plain loop
cookies_dict = {}
cookies_list = cookies.split("; ")
for item in cookies_list:
    cookies_dict[item.split("=")[0]] = item.split("=")[1]
## method 2: a dict comprehension
cookies_dict = {
    item.split("=")[0]: item.split("=")[1] for item in cookies.split("; ")
}
response = requests.get(url, headers=headers, cookies=cookies_dict)
data = response.content
with open("11.4_cookies_dict.html", "wb") as f:
    f.write(data)
11.5-requests_cookies_auto_login
import requests
## 11.5-requests_cookies_auto_login
## log in from code, then request pages with the cookie carried along automatically
## the session class stores cookies for us, much like cookiejar does for urllib.request
session = requests.session()
# 1. log in from code
login_url = "https://www.yaozh.com/login"
member_url = "https://www.yaozh.com/member/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
login_form_data = {
    'username': 'moox2020',
    'pwd': '',
    'formhash': 'E350124FCF',  ## formhash and backurl are found in the source of the login page
    'backurl': '%2F%2Fwww.yaozh.com%2F',
}
## note: after a successful login the cookie lives in the response; here the session should keep it,
## so the login must be posted through the session rather than through requests.post
# url_response = requests.post(login_url, data=login_form_data)
## once the login succeeds, the session holds a valid cookie and later requests go through the same session
login_response = session.post(login_url, headers=headers, data=login_form_data)
print(login_response.content.decode())  ## on success the returned JSON contains "state":"success"
## the JSON can be inspected with a viewer such as http://www.bejson.com/
# 2. after logging in, request the target page through the same session, which carries the valid cookie
data = session.get(member_url, headers=headers).content
with open("11.5_cookies_auto_login.html", "wb") as f:
    f.write(data)
3、re