昨天能够登录成功,但是不能使用 cookies;今天试了一下 requests 库的 Session(),发现可以保持会话了,代码只是稍作改动。
# -*- coding: utf-8 -*-
"""Log in to Douban through a single requests.Session and save a contacts page.

The same Session object is used for the login POST(s) and for the later GET,
so cookies set during login are sent along with the contacts-page request.
If the login response contains a captcha image, the image is downloaded to
``captcha.jpg``, the user is asked to type the solution, and the login form
is submitted a second time with the captcha fields filled in.
"""
import re
import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded

import html5lib  # noqa: F401  (not referenced directly; the "html5lib" parser is requested below)
import requests
from bs4 import BeautifulSoup

LOGIN_URL = 'http://accounts.douban.com/login'
CONTACTS_URL = 'http://www.douban.com/people/****/contacts'
CAPTCHA_FILE = 'captcha.jpg'
OUTPUT_FILE = 'spider2.txt'

# Hidden form field that carries the captcha ID in the login page markup.
CAPTCHA_ID_PATTERN = r'<input type="hidden" name="captcha-id" value="(.*?)"/'

HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Referer": "http://accounts.douban.com/login",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
}


def _build_formdata():
    """Return the base login form; captcha fields are added later if needed."""
    return {
        "redir": "http://www.douban.com/",
        "form_email": "*******",
        "form_password": "******",
        "login": "登录",
    }


def _ask_for_captcha(login_html):
    """Return the captcha form fields for *login_html*, or None if no captcha.

    When the page contains an ``<img id="captcha_image">``, the image is saved
    to ``CAPTCHA_FILE`` and the solution is read from standard input.

    Returns:
        A dict with ``captcha-solution`` and, when the hidden field was found
        in the markup, ``captcha-id``; ``None`` when the page has no captcha
        image.
    """
    # html5lib is installed, so it is used instead of Python's built-in HTML parser.
    soup = BeautifulSoup(login_html, "html5lib")
    captcha_img = soup.find('img', id='captcha_image')
    if captcha_img is None:
        # No captcha on the page: nothing to solve, no second login needed.
        return None

    fields = {}
    id_match = re.search(CAPTCHA_ID_PATTERN, login_html)
    if id_match is not None:
        # Send the ID as a single string rather than the list re.findall gives.
        fields['captcha-id'] = id_match.group(1)
        print(fields['captcha-id'])

    urllib.request.urlretrieve(captcha_img['src'], CAPTCHA_FILE)
    fields['captcha-solution'] = input('please input the captcha:')
    return fields


def main():
    """Log in (solving a captcha if one is shown) and dump the contacts page."""
    session = requests.Session()
    formdata = _build_formdata()

    login_response = session.post(LOGIN_URL, data=formdata, headers=HEADERS)
    captcha_fields = _ask_for_captcha(login_response.text)
    if captcha_fields is not None:
        formdata.update(captcha_fields)
        session.post(LOGIN_URL, data=formdata, headers=HEADERS)

    contacts_response = session.get(CONTACTS_URL)
    # The context manager closes the file even if the write raises.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
        out.write(contacts_response.text)


if __name__ == "__main__":
    main()