# Full source code is available on GitHub: https://github.com/StydyOnce/yunfile
import re
import requests
import pytesseract
import time
from PIL import Image
from bs4 import BeautifulSoup

# HTTP headers sent with every request made through ``session``.
headers = {"User-Agent": "Mozil",
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
           "Accept-Language": "zh-CN,zh;q=0.9",
           "Referer": "http:"
           }

# One shared session, so cookies set by earlier responses are sent with
# the later requests.
session = requests.Session()

# Binarisation lookup table for 8-bit grey levels: levels below
# ``threshold`` map to 0, all others map to 1.
threshold = 100
table = [0 if level < threshold else 1 for level in range(256)]

# The verification code consists of digits only, so characters that the OCR
# engine commonly returns in place of a digit are mapped back to that digit.
rep = {'O': '0',
       'I': '1',
       'L': '1',
       'Z': '2',
       'S': '8',
       '$': '6'
       }


def _normalize_code(text):
    """Return the digits of raw OCR output after applying the ``rep`` fixes."""
    text = text.strip().upper()
    for wrong, right in rep.items():
        text = text.replace(wrong, right)
    # Whatever is still not a digit is noise; drop it.
    return re.sub("[^0-9]", "", text)


def _fetch_code_once():
    """Download one verification image, OCR it and return the cleaned digits.

    Side effects: writes ``picture.jpg`` (raw download), ``gpicture.jpg``
    (greyscale) and ``bpicture.jpg`` (binarised) to the working directory.
    """
    image_name = 'picture.jpg'

    # The response body is not used; the request is issued before the
    # verification image is fetched through the same session.
    # NOTE(review): "http:" looks like a placeholder for the share-page
    # address (presumably the ``url`` argument of get_verify_code) -- confirm.
    session.get("http:", headers=headers)
    reply = session.get('http://page4.dfpan.com/verifyimg/getPcv.html',
                        headers=headers)
    with open(image_name, 'wb') as file:
        file.write(reply.content)

    # Convert to greyscale inside the ``with`` so the source file is closed.
    with Image.open(image_name) as image:
        imgry = image.convert('L')
    imgry.save('g' + image_name)

    # Threshold segmentation through the precomputed lookup table.
    out = imgry.point(table, '1')
    out.save('b' + image_name)

    return _normalize_code(pytesseract.image_to_string(out))


def get_verify_code(url, max_attempts=100):
    """Fetch and OCR the verification code until a 4-digit result is read.

    A result that does not contain exactly four digits is treated as a
    misread and a fresh image is requested.  Retrying is done with a loop
    (not recursion), so repeated misreads cannot exhaust the call stack and
    the accepted code is printed exactly once.

    Args:
        url: Address of the share page.  Currently unused by the requests
            issued here; kept for interface compatibility.
        max_attempts: Maximum number of images to try.

    Returns:
        The recognised code as a string of four digits.

    Raises:
        RuntimeError: If no attempt produced a 4-digit code.
    """
    for _ in range(max_attempts):
        text = _fetch_code_once()
        if len(text) == 4:
            print(text)
            return text
    raise RuntimeError(
        "could not read a 4-digit verification code after %d attempts"
        % max_attempts
    )


def build_info(data):
    """Parse the HTML text *data* and print the matching ``<table>`` elements."""
    soup = BeautifulSoup(data, "html.parser")
    # NOTE(review): BeautifulSoup treats a string ``attrs`` as a CSS class
    # filter (class="style"); if tables carrying a ``style`` attribute were
    # intended, this should be attrs={'style': True} -- confirm.
    ret = soup.find_all(name='table', attrs='style')
    print(ret)


def get_new_url(url):
    """Build the download-page URL from *url* and a verification code, then fetch it.

    The target address is ``url`` + code + ``'.html'``.  The status code is
    printed; a 200 response is passed to build_info(), any other status only
    prints a notice.

    Args:
        url: Base address the verification code is appended to.
    """
    verify_code = get_verify_code(url)
    print(verify_code)
    new_url = f"{url}{verify_code}.html"
    print(new_url)

    response = session.get(new_url, headers=headers)
    print(response.status_code)
    # For debugging, response.url, response.headers, response.cookies,
    # response.text and response.content can be printed here as well.

    # The former separate 302 branch printed the same message as the general
    # non-200 branch, so the two are merged.
    if response.status_code != 200:
        print("aleady new location")
    else:
        build_info(response.text)


if __name__ == "__main__":
    get_new_url("")