安装requests模块: pip install requests
requests模块的常用方法, 属性, 参数
import requests
# Basic GET request. Signature: requests.get(url, params=None, **kwargs)
ret = requests.get(url='https://www.baidu.com')
ret.encoding = 'utf-8'  # encoding used when decoding the body into ret.text
print(ret.content)  # response body, bytes
print(ret.text)  # response body, str
print(ret.url)  # URL of the response
# Response headers; a requests.structures.CaseInsensitiveDict, used like a dict.
print(ret.headers, type(ret.headers))
# ret.json() raises when the body is not valid JSON, so only call it when the
# response declares a JSON Content-Type.
if 'application/json' in ret.headers.get('Content-Type', ''):
    print(ret.json())

params = {  # query-string parameters carried in the URL of a GET request
    "keyword": "O98K",
}
header = {  # request headers
    "name": "SATH",
}
# The keyword argument is `headers` (plural); `header=` raises TypeError.
ret = requests.get(url='http://www.baidu.com', params=params, headers=header)

data = {  # form fields for a POST request body, e.g. requests.post(url, data=data)
    "name": "sath",
}
爬虫案例一: 爬取搜狗指定词条搜索后的页面数据
import requests
# Inspecting Sogou's search request shows that the keyword is submitted to
# https://www.sogou.com/web with a GET request, in the "query" parameter.
url = "https://www.sogou.com/web"
params = dict(query="apple")
ret = requests.get(url, params=params)

# Save the decoded response body to ./sogou.html.
with open('./sogou.html', mode='w', encoding='utf-8') as page_file:
    page_file.write(ret.text)
爬虫案例二: 爬取豆瓣电影分类排行榜中的电影详情数据
import requests
from multiprocessing import Pool
import time
url = 'https://movie.douban.com/j/new_search_subjects'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}
# NOTE: when get_movie runs inside a multiprocessing worker, the worker appends
# to its own copy of this list; the parent process only sees titles through
# get_movie's return value.
movie_title_list = []


def get_movie(start):
    """Fetch one result page of the genre "喜剧" (comedy) and return its titles.

    Args:
        start: value sent as the "start" query parameter (result offset).

    Returns:
        A list of the "title" values found under the "data" key of the JSON
        response, or an empty list when the response is not declared as JSON.
        Each title is also printed and appended to ``movie_title_list``.
    """
    params = {
        "sort": "U",
        "tags": "",
        "start": start,
        "genres": "喜剧",
    }
    # A timeout keeps one stalled connection from hanging the worker forever.
    ret = requests.get(url=url, params=params, headers=header, timeout=10)
    # .get() avoids a KeyError when the header is missing; the prefix test
    # tolerates differences in charset spelling and spacing.
    content_type = ret.headers.get('Content-Type', '')
    if not content_type.lower().startswith('application/json'):
        return []
    titles = []
    for movie in ret.json()["data"]:
        titles.append(movie["title"])
        movie_title_list.append(movie["title"])
        print(movie["title"])
    return titles
if __name__ == '__main__':
    p = Pool(20)
    start = time.time()
    offsets = range(0, 10000, 20)
    # Keep every AsyncResult: without calling .get() on them, exceptions raised
    # inside the workers are silently discarded.
    pending = [p.apply_async(get_movie, args=(n,)) for n in offsets]
    p.close()
    p.join()
    print(time.time() - start)
    # The original author measured about 14s for this run.

    # Workers run in separate processes, so their appends to movie_title_list
    # never reach this process; merge whatever each task returned instead.
    for n, task in zip(offsets, pending):
        try:
            titles = task.get()
        except Exception as exc:  # top-level boundary: report and continue
            print("start", n, "failed:", repr(exc))
            continue
        # get_movie may return None when it does not report titles.
        movie_title_list.extend(titles or [])
爬虫案例三: 爬取肯德基餐厅查询中指定地点的餐厅数据
import requests
import json
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}
# Form fields sent in the POST body.
data = dict(cname="", pid="", keyword="邯郸", pageIndex="1", pageSize="10")
# Sent in the query string as op=keyword.
query = {"op": "keyword"}

ret = requests.post(url, data=data, headers=header, params=query)
# Parse the response text as JSON and show both the value and its Python type.
res = json.loads(ret.text)
print(res, type(res))
爬虫案例四: 药监局信息爬取
import requests
from multiprocessing import Pool
url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}


def collect_ids(pages, list_url=url):
    """POST the list endpoint once per page number and gather the record IDs.

    Args:
        pages: iterable of page numbers sent as the "page" form field.
        list_url: endpoint to query; bound at definition time so a later
            reassignment of the module-level ``url`` does not affect it.

    Returns:
        A list with the "ID" value of every entry under the "list" key of
        each JSON response. Pages that fail or are not JSON are skipped.
    """
    found = []
    for page in pages:
        data = {
            "on": "true",
            "page": page,
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        try:
            ret = requests.post(url=list_url, headers=header, data=data, timeout=10)
        except requests.RequestException as exc:
            # One bad page should not throw away the IDs collected so far.
            print("page", page, "failed:", repr(exc))
            continue
        content_type = ret.headers.get('Content-Type', '')
        if not content_type.lower().startswith('application/json'):
            continue
        found.extend(n['ID'] for n in ret.json()["list"])
    return found


# Crawl only in the main process: under the "spawn" start method every Pool
# worker re-imports this module and would otherwise repeat all the requests.
ids = collect_ids(range(20, 250)) if __name__ == '__main__' else []
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'


def func(k):
    """POST the id *k* to the detail endpoint and print its "businessPerson".

    Nothing is printed when the response Content-Type is not exactly
    "application/json;charset=UTF-8".
    """
    r = requests.post(url=url, headers=header, data={"id": k})
    if r.headers['Content-Type'] != "application/json;charset=UTF-8":
        return
    print(r.json()["businessPerson"])
if __name__ == '__main__':
    p = Pool(14)
    # apply_async's second parameter is the *args sequence*. Passing the bare
    # id there unpacks it (a string id is split into characters), so func is
    # called with the wrong arguments. Wrap the id in a one-element tuple.
    tasks = [p.apply_async(func, args=(k,)) for k in ids]
    p.close()
    p.join()
    # Surface worker failures; an AsyncResult only re-raises on .get().
    for k, task in zip(ids, tasks):
        try:
            task.get()
        except Exception as exc:  # top-level boundary: report and continue
            print("id", k, "failed:", repr(exc))