1. What is the essence of a web crawler?
It mimics the behavior of a browser to fetch information from web pages.
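As a minimal sketch of that idea (the target URL and User-Agent string below are only placeholders), a crawler sends the same kind of request a browser would and then reads the returned content:

import requests

# Pretend to be a browser by sending a browser-like User-Agent header,
# then read the HTML (or JSON) the server returns.
ret = requests.get(
    url='http://httpbin.org/get',  # placeholder target
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # placeholder UA string
    },
)
print(ret.status_code)
print(ret.text[:200])  # the first part of the returned content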
2.requests
1. GET requests
Example without parameters:

import requests

ret = requests.get('https://github.com/timeline.json')
print(ret.text)

Example with parameters:

import requests

ret = requests.get("http://httpbin.org/get", params={'key1': 'value1', 'key2': 'value2'})
print(ret.text)
2. POST requests
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'v1': 'k1'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
3. Other request methods
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the methods above are built on top of this one
requests.request(method, url, **kwargs)
4. More parameters and examples
- method
def param_method_url():
    ret = requests.request(method='get', url='http://127.0.0.1:8000/test/')
    ret = requests.request(method='post', url='http://127.0.0.1:8000/test/')
- params
import requests

requests.get(url='http://127.0.0.1:8000/test/', params={'k1': 'v1', 'k2': 'v2'})
# Essentially the same as requests.get(url='xxxxx?k1=v1&k2=v2')
- data
# data can be a dict, a string, bytes, or a file object

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data={'k1': 'v1', 'k2': '水电费'})

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1; k2=v2; k3=v3; k3=v4")

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1;k2=v2;k3=v3;k3=v4",
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'})

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data=open('data_file.py', mode='r', encoding='utf-8'),  # file content: k1=v1;k2=v2;k3=v3;k3=v4
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'})
- json
# If the request body has to be a JSON payload, pass it via the json parameter
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水电费'})
- cookies
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
)
ret1_cookies = ret1.cookies.get_dict()
# ret1.cookies is the cookie jar returned for this URL;
# get_dict() turns it into a plain dict of cookies
- headers
# Send request headers to the server
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水电费'},
                 headers={'Content-Type': 'application/x-www-form-urlencoded'})
# Which headers are required depends on the server
- files
# Send a file
# file_dict = {
#     'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom file name
# file_dict = {
#     'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom file name and custom content
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom file name, content, content type and extra headers
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)
- timeout
Set a timeout; if the request takes longer than the timeout, it is aborted.

# ret = requests.get('http://google.com/', timeout=1)
# print(ret)

# a tuple means (connect timeout, read timeout)
# ret = requests.get('http://google.com/', timeout=(5, 1))
# print(ret)
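When the timeout is exceeded, requests raises an exception rather than returning a response. A small runnable sketch of catching it, assuming httpbin.org is reachable (its /delay/3 endpoint answers after 3 seconds, so a 1-second timeout triggers the exception):

import requests

try:
    ret = requests.get('http://httpbin.org/delay/3', timeout=1)
    print(ret.status_code)
except requests.exceptions.Timeout:
    # raised when the connect or read timeout is exceeded
    print('request timed out')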
- allow_redirects
# Whether to follow redirects; defaults to True
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)
3. BeautifulSoup
This module parses the HTML or XML you receive, so you can quickly locate the tags you want by working with objects.
- Usage example
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
- name ---> the tag name
# tag = soup.find('a')
# name = tag.name     # get
# print(name)
# tag.name = 'span'   # set
- attrs ---> the tag's attributes
# tag = soup.find('a')
# attrs = tag.attrs              # get
# print(attrs)
# tag.attrs = {'ik': 123}        # set (replace all attributes)
# tag.attrs['id'] = 'iiiii'      # set a single attribute
- children ---> all direct child tags
# body = soup.find('body')
# v = body.children
- descendants ---> all descendants
# body = soup.find('body')
# v = body.descendants
- clear ---> empty everything inside the tag (the tag itself is kept)
# tag = soup.find('body')
# tag.clear()
# print(soup)
- extract, remove the tag (and everything inside it) from the tree and return the removed tag
# body = soup.find('body')
# v = body.extract()
# print(soup)
- find, get the first matching tag
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
- find_all, get all matching tags
# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

# ####### lists #######
# v = soup.find_all(name=['a', 'div'])
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
- has_attr, check whether the tag has a given attribute
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
Autohome (autohome.com.cn) scraping example
import requests
from bs4 import BeautifulSoup  # this module parses HTML

# Download the page
ret = requests.get(url='https://www.autohome.com.cn/news/')
# print(ret.apparent_encoding)  # the encoding detected for the page
# print(ret.content)
# ret.encoding = 'gbk'
ret.encoding = ret.apparent_encoding
# print(ret.text)

# Parse the page and pull out what we want
soup = BeautifulSoup(ret.text, features='html.parser')  # in production, lxml is common (requires a separate install)

# find returns the first match
div = soup.find(name='div', id='auto-channel-lazyload-article')
# When you also need to match by class, use attrs:
# div = soup.find(name='div', attrs={'class': 'dddd', 'id': 'dfa'})

li_list = div.find_all(name='li')  # find_all returns a list, so you cannot call .find on it
# print(li_list)

for row in li_list:
    h3 = row.find(name='h3')
    if not h3:
        continue

    a = row.find(name='a')
    print(a.get('href'))

    p = row.find(name='p')
    print(p.text)

    li_img = row.find(name='img')
    src = li_img.get('src')
    file_name = src.rsplit('__', maxsplit=1)[1]

    ret_img = requests.get('https:' + src)
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
Chouti (dig.chouti.com) example
import requests
from bs4 import BeautifulSoup

# The first visit returns an unauthorized cookie
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    })
ret1_cookies = ret1.cookies.get_dict()

# After a successful login, that cookie becomes authorized
ret = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613612201458',
        'password': 'wo3384451',
        'oneMonth': '1'
    },
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    },
    cookies=ret1_cookies,
)

for num_page in range(2, 10):
    ret_index = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % (num_page),
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        },
    )
    soup = BeautifulSoup(ret_index.text, 'html.parser')
    div = soup.find(name='div', id='content-list')
    item_list = div.find_all(attrs={'class': 'part2'})
    for item in item_list:
        num = item.get('share-linkid')
        # Upvote while carrying the now-authorized cookie
        ret3 = requests.post(
            url='https://dig.chouti.com/link/vote?linksId=%s' % (num),
            # data={'linksId': '%s' % (num)},
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
            },
            cookies=ret1_cookies
        )
        print(ret3.text)
GitHub example
import requests
import re
from bs4 import BeautifulSoup


class Github(object):
    def __init__(self, username=None, passward=None):
        self.username = username
        self.passward = passward
        self.all_cookies = {}
        self.process()

    def process(self):
        if not (self.username and self.passward):
            raise Exception('Please provide a username and password')
        self.get_login_key()

    def get_login_key(self):
        # Fetch the authenticity_token from the login page
        login_result = requests.get(
            url='https://github.com/login',
            headers={
                'Host': 'github.com',
            }
        )
        auth_key = BS4xpath.get_auth_key(login_result.text)
        self.all_cookies = login_result.cookies.get_dict()
        self.login(auth_key)

    def login(self, auth_key):
        # Log in and collect the authenticated cookies
        login_result = requests.post(
            url='https://github.com/session',
            headers={
                'Upgrade-Insecure-Requests': '1',
                'Host': 'github.com',
            },
            data={
                'utf8': '✓',
                'authenticity_token': auth_key,
                'login': self.username,
                'password': self.passward,
                'commit': 'Sign in'
            },
            cookies=self.all_cookies
        )
        self.all_cookies.update(login_result.cookies.get_dict())
        if self.all_cookies['logged_in'] == 'no':
            raise Exception('Wrong username or password')

    def get_msg(self):
        msg_obj = requests.get(
            url='https://github.com/settings/profile',
            headers={
                'Host': 'github.com',
                'Referer': 'https://github.com/',
            },
            cookies=self.all_cookies
        )
        msg = BS4xpath.get_msg_dict(msg_obj.text)
        return msg


class BS4xpath(object):
    @classmethod
    def get_auth_key(cls, text):
        soup = BeautifulSoup(text, 'html.parser')
        auth_key = soup.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
        return auth_key

    @classmethod
    def get_msg_dict(cls, text):
        response = {}
        ret2_data = BeautifulSoup(text, 'html.parser')
        div = ret2_data.find(name='div', attrs={'class': "column two-thirds"})
        dl_list = div.find_all(name='dl', attrs={'class': "form-group"})
        for row in dl_list:
            rowname = row.find('label').text
            dd_input = row.find('input')
            if dd_input:
                response[rowname] = dd_input.get('value')
        return response


obj = Github(username='a3384451', passward='wo3384451')
ret = obj.get_msg()
print(ret)
Lagou (lagou.com) example
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import requests

all_cookie = {}

# ############### 1. Fetch the login page ###############
r1 = requests.get(
    url='https://passport.lagou.com/login/login.html',
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
)
all_cookie.update(r1.cookies.get_dict())
X_Anti_Forge_Token = re.findall(r"window.X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"window.X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# ############### 2. Log in with username and password ###############
r2 = requests.post(
    url='https://passport.lagou.com/login/login.json',
    headers={
        'Host': 'passport.lagou.com',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    },
    data={
        'isValidate': True,
        'username': '15131255089',
        'password': 'ab18d270d7126ea65915cc22c0d',
        'request_form_verifyCode': '',
        'submit': '',
    },
    cookies=r1.cookies.get_dict()
)
all_cookie.update(r2.cookies.get_dict())

# ############### 3. User authorization (grant service ticket) ###############
r3 = requests.get(
    url='https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r3.cookies.get_dict())

# ############### 4. User authentication (follow the redirect chain manually) ###############
r4 = requests.get(
    url=r3.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r4.cookies.get_dict())

r5 = requests.get(
    url=r4.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r5.cookies.get_dict())

r6 = requests.get(
    url=r5.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r6.cookies.get_dict())

r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r7.cookies.get_dict())

# ############### 5. View the personal resume page ###############
r5 = requests.get(
    url='https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    cookies=all_cookie
)
print('武沛齐' in r5.text)

# ############### 6. View account info ###############
r6 = requests.get(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'X-L-REQ-HEADER': "{deviceType:1}",
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
    },
    cookies=all_cookie
)
r6_json = r6.json()
all_cookie.update(r6.cookies.get_dict())

# ############### 7. Update profile info ###############
r7 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
        'X-Anit-Forge-Code': r6_json['submitCode'],
        'X-Anit-Forge-Token': r6_json['submitToken'],
        'X-L-REQ-HEADER': "{deviceType:1}",
    },
    cookies=all_cookie,
    json={"userName": "wupeiqi888", "sex": "MALE", "portrait": "images/myresume/default_headpic.png",
          "positionName": '...', "introduce": '....'}
)
print(r7.text)
Preventing XSS attacks
from bs4 import BeautifulSoup


class XSSFilter(object):
    __instance = None

    def __init__(self):
        # XSS whitelist: allowed tags and the attributes each may keep
        self.valid_tags = {
            "font": ['color', 'size', 'face', 'style'],
            'b': [],
            'div': [],
            "span": [],
            "table": ['border', 'cellspacing', 'cellpadding'],
            'th': ['colspan', 'rowspan'],
            'td': ['colspan', 'rowspan'],
            "a": ['href', 'target', 'name'],
            "img": ['src', 'alt', 'title'],
            'p': ['align'],
            "pre": ['class'],
            "hr": ['class'],
            'strong': []
        }

    def __new__(cls, *args, **kwargs):
        # Singleton: only one filter instance is ever created
        if not cls.__instance:
            obj = object.__new__(cls)
            cls.__instance = obj
        return cls.__instance

    def process(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        # Walk every HTML tag in the document
        for tag in soup.find_all():
            # Check whether the tag name is whitelisted
            if tag.name not in self.valid_tags:
                tag.hidden = True
                if tag.name not in ['html', 'body']:
                    tag.hidden = True
                    tag.clear()
                continue
            # Attribute whitelist for the current tag
            attr_rules = self.valid_tags[tag.name]
            keys = list(tag.attrs.keys())
            for key in keys:
                if key not in attr_rules:
                    del tag[key]
        return soup.decode()  # this is the filtered content


content = """
<p class='c1' id='i1'>
    asdfaa<span style="font-family:NSimSun;" class='c1'>sdf<a>a</a>sdf</span>sdf
</p>
<p>
    <strong class='c2' id='i2'>asdf</strong>
    <script>alert(123)</script>
</p>
<h2>
    asdf
</h2>
"""

content = XSSFilter().process(content)
print('content', content)
Summary:
- If the target site has anti-scraping measures, make the requests you send to the server mimic a browser.
- If a request needs to carry information along:
    - Look for it in the server's response; if it is there, parse it into a dict (or similar) and keep it, for example in a session (see the sketch after this list).
    - A value that looks like 159900098 is usually a timestamp, but you have to check the number of digits yourself.
    - If the key is not in the response body, look for the data in the HTML or JavaScript.
    - The next request may need to carry a key (or other value) the server sent in the previous response.
- Status codes:
    - Codes starting with 3 are automatic redirects; cookie authentication may happen during the redirect.
    - Pay attention to the Set-Cookie header in the response/request.
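Below is a minimal sketch of the "keep it in a session" and timestamp tips; the login URL and form fields are hypothetical placeholders, not any real site's API:

import time
import requests

# requests.Session stores cookies from earlier responses and sends them on
# later requests automatically, so you don't have to pass cookies= by hand.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # pretend to be a browser
})

# hypothetical login endpoint and form fields, for illustration only:
# session.post('http://example.com/login', data={'user': 'xxx', 'pwd': 'xxx'})
# session.get('http://example.com/profile')  # carries the login cookies automatically

# Timestamps in request parameters are usually seconds (10 digits) or
# milliseconds (13 digits); count the digits to see which one a site expects.
print(int(time.time()))         # 10-digit value, seconds
print(int(time.time() * 1000))  # 13-digit value, milliseconds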
Reference: http://www.cnblogs.com/wupeiqi/articles/6283017.html
Official docs: http://cn.python-requests.org/zh_CN/latest/user/quickstart.html#id4