python内置的爬虫模块
- urllib
- requests
定义
requests模块:python中原生的一款基于网络请求的模块,功能非常强大,简单便捷,效率极高。
如何使用
- 指定url
- UA伪装
- 请求参数处理
- 发起请求
- 获取响应数据
- 持久化存储
安装
pip install requests
实战
- 需求: 爬取搜狗首页的页面数据
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#- 需求:爬取搜狗首页的页面数据
import requests
if __name__ == "__main__":
#step_1:指定url
url = 'https://www.sogou.com/'
#step_2:发起请求
#get方法会返回一个响应对象
response = requests.get(url=url)
#step_3:获取响应数据.text返回的是字符串形式的响应数据
page_text = response.text
print(page_text)
#step_4:持久化存储
with open('./sogou.html','w',encoding='utf-8') as fp:
fp.write(page_text)
print('爬取数据结束!!!')
- 需求: 破解百度翻译
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
if __name__ == "__main__":
#1.指定url
post_url = 'https://fanyi.baidu.com/sug'
#2.进行UA伪装
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
#3.post请求参数处理(同get请求一致)
word = input('enter a word:')
data = {
'kw':word
}
#4.请求发送
response = requests.post(url=post_url,data=data,headers=headers)
#5.获取响应数据:json()方法返回的是obj(如果确认响应数据是json类型的,才可以使用json())
dic_obj = response.json()
#持久化存储
fileName = word+'.json'
fp = open(fileName,'w',encoding='utf-8')
json.dump(dic_obj,fp=fp,ensure_ascii=False)
print('over!!!')
- 需求: 爬取豆瓣电影分类排行榜 https://movie.douban.com/中的电影详情数据
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
if __name__ == "__main__":
url = 'https://movie.douban.com/j/chart/top_list'
param = {
'type': '24',
'interval_id': '100:90',
'action':'',
'start': '0',#从库中的第几部电影去取
'limit': '20',#一次取出的个数
}
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
response = requests.get(url=url,params=param,headers=headers)
list_data = response.json()
fp = open('./douban.json','w',encoding='utf-8')
json.dump(list_data,fp=fp,ensure_ascii=False)
print('over!!!')
- 需求: 爬取肯德基餐厅查询http://www.kfc.com.cn/kfccda/index.aspx中指定地点的餐厅数据
import requests, json
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
data = {
"cname": "",
"pid": "",
"keyword": "深圳",
"pageIndex": 1,
"pageSize": 10
}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
}
response = requests.post(url=url, data=data, headers=headers)
content = response.json()
with open('KFC.json', 'w', encoding='utf-8') as f:
json.dump(content, f, ensure_ascii=False, indent=2)
- 需求: 爬取国家药品监督管理总局中基于中华人民共和国化妆品生产许可证相关数据
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
if __name__ == "__main__":
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
id_list = [] # 存储企业的id
all_data_list = [] # 存储所有的企业详情数据
#批量获取不同企业的id值
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
#参数的封装
for page in range(1,6):
page = str(page)
data = {
'on': 'true',
'page': page,
'pageSize': '15',
'productName':'',
'conditionType': '1',
'applyname':'',
'applysn':'',
}
json_ids = requests.post(url=url,headers=headers,data=data).json()
for dic in json_ids['list']:
id_list.append(dic['ID'])
#获取企业详情数据
post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
data = {
'id':id
}
detail_json = requests.post(url=post_url,headers=headers,data=data).json()
# print(detail_json,'-------------ending-----------')
all_data_list.append(detail_json)
#持久化存储all_data_list
fp = open('./allData.json','w',encoding='utf-8')
json.dump(all_data_list,fp=fp,ensure_ascii=False)
print('over!!!')