爬取电影网站链接，进入 yunfile 网盘并通过验证码下载的 Python 脚本

完整代码见 GitHub：https://github.com/StydyOnce/yunfile

import re
import requests
import pytesseract
import time
from PIL import Image
from bs4 import BeautifulSoup

# Request headers passed to every HTTP call in this script.
# NOTE(review): the "User-Agent" and "Referer" values look truncated;
# confirm the intended values before relying on them.
headers = {
    "User-Agent": "Mozil",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "http:",
}

session = requests.Session()            # shared Session so cookies from every request are remembered

# Binarization lookup table: gray levels below `threshold` map to 0 and
# every other level maps to 1.
threshold = 100
table = [0 if level < threshold else 1 for level in range(256)]

# The captcha consists of digits only, so characters that OCR reports as
# letters or symbols are corrected to digits using this table.
rep = {
    'O': '0',
    'I': '1',
    'L': '1',
    'Z': '2',
    'S': '8',
    '$': '6',
}
def get_verify_code(url, max_attempts=100):
    """Download the netdisk captcha, OCR it, and return the 4-digit code.

    Each attempt fetches a fresh captcha image, writes it to
    ``picture.jpg``, saves a grayscale copy (``gpicture.jpg``) and a
    binarized copy (``bpicture.jpg``), runs Tesseract on the binarized
    image and cleans up the recognized text.  An attempt is accepted only
    when exactly four digits remain; otherwise a new captcha is fetched.

    Args:
        url: Address of the netdisk landing page.  Currently unused: the
            requests below go to fixed addresses.
        max_attempts: Upper bound on the number of captchas fetched before
            giving up.  Replaces the former unbounded recursive retry.

    Returns:
        The recognized verification code, a string of four digits.

    Raises:
        RuntimeError: If no attempt produced a four-digit result.
    """
    image_name = 'picture.jpg'

    for _ in range(max_attempts):
        # NOTE(review): the response of this request is not used; presumably
        # it is issued only so the session picks up cookies before the
        # captcha is requested. The address looks truncated - confirm.
        session.get("http:", headers=headers)
        captcha = session.get('http://page4.dfpan.com/verifyimg/getPcv.html', headers=headers)

        with open(image_name, 'wb') as file:
            file.write(captcha.content)

        # Convert to grayscale; the context manager closes the image file.
        with Image.open(image_name) as image:
            imgry = image.convert('L')
        imgry.save('g' + image_name)

        # Binarize by thresholding, using the module-level lookup table.
        out = imgry.point(table, '1')
        out.save('b' + image_name)

        # OCR, then normalize: trim, upper-case, map look-alike characters
        # to digits, and finally drop anything that is still not a digit.
        text = pytesseract.image_to_string(out).strip().upper()
        for wrong, right in rep.items():
            text = text.replace(wrong, right)
        text = re.sub("[^0-9]", "", text)

        # Anything other than four digits must be a misread; try again.
        if len(text) == 4:
            print(text)
            return text

    raise RuntimeError(
        "could not recognize a 4-digit verification code after "
        + str(max_attempts) + " attempts"
    )

def build_info(data):
    """Parse an HTML page and print the ``<table>`` elements selected from it.

    Args:
        data: HTML markup of the downloaded page.
    """
    document = BeautifulSoup(data, "html.parser")
    # NOTE(review): a string given as ``attrs`` is presumably matched against
    # the CSS class (i.e. <table class="style">), not "has a style
    # attribute" - confirm which one is intended.
    matching_tables = document.find_all(name='table', attrs='style')
    print(matching_tables)

def get_new_url(url):
    """Fetch the page addressed by ``url`` + verification code + ``.html``.

    Obtains a verification code via ``get_verify_code``, builds the target
    address, requests it through the shared session, and prints the code,
    the address and the HTTP status along the way.  A 200 response is
    handed to ``build_info``; any other status only prints a message.

    Args:
        url: Prefix of the page address; the code and ``.html`` are appended.
    """
    code = get_verify_code(url)
    print(code)

    target = "".join((str(url), str(code), '.html'))
    print(target)

    page = session.get(target, headers=headers)
    status = page.status_code
    print(status)

    # Every non-200 status (302 included) gets the same message.
    if status != 200:
        print("aleady new location")
        return

    build_info(page.text)

if __name__ == "__main__":

    # Script entry point. get_new_url() appends the recognized verification
    # code and ".html" to this string to form the address it requests.
    # NOTE(review): the argument is empty here - presumably the real netdisk
    # URL prefix was removed from this listing; confirm before running.
    get_new_url("")

相关阅读:
!clrstack未显示任何方法名
!dumpheap参数和SOS帮助系统的一些一般信息
WinDbg Script---显示RCW对象引用的COM对象
为什么不能将2个调试器附加到1个进程
什么是互操作调试(Interop-Debugging)？
为什么托管调试与本机调试不同？
在WinDBG中管理源代码窗口
如何从转储文件确定 /LARGEADDRESSAWARE
Dump文件数据存储格式(九)
Windbg命令系列---!cpuid(CPU信息)
原文地址：https://www.cnblogs.com/wangshaowei/p/8678105.html