• 爬取电影网站链接,进入 yunfile 网盘并通过验证码下载的 Python 脚本


    完整代码见 GitHub:https://github.com/StydyOnce/yunfile

    import re
    import requests
    import pytesseract
    import time
    from PIL import Image
    from bs4 import BeautifulSoup
    
    # HTTP headers passed to every session.get() call in this script.
    # NOTE(review): the "User-Agent" and "Referer" values look truncated
    # (possibly redacted when the code was published) — confirm the real values.
    headers = {
        "User-Agent": "Mozil",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Referer": "http:",
    }
    
    session = requests.Session()            # shared session: keeps the cookies of all requests
    
    # Binarization lookup table for Image.point(): one entry per grayscale
    # level (0-255). Levels below `threshold` map to 0, all others map to 1
    # (0 renders as black and 1 as white in a mode-'1' image).
    threshold = 100
    table = [0 if level < threshold else 1 for level in range(256)]
    
        # 由于都是数字
    # Correction table for the OCR result: the captcha holds only digits, so
    # characters recognized as letters/symbols are replaced by the digit
    # listed here (the text is upper-cased before the table is applied).
    rep = {
        'O': '0',
        'I': '1',
        'L': '1',
        'Z': '2',
        'S': '8',
        '$': '6',
    }
    def _clean_ocr_text(raw):
        """Normalize raw OCR output into a digits-only string.

        Strips surrounding whitespace, upper-cases the text, applies the
        ``rep`` look-alike corrections and finally removes every character
        that is not a digit.
        """
        cleaned = raw.strip().upper()
        for wrong, right in rep.items():
            cleaned = cleaned.replace(wrong, right)
        return re.sub("[^0-9]", "", cleaned)

    def get_verify_code(url, max_attempts=None):
        """Download the captcha image and OCR it into a 4-digit string.

        Each attempt fetches a fresh captcha through the shared ``session``,
        converts it to grayscale, binarizes it with ``table`` and runs
        pytesseract on the result. An attempt succeeds when exactly four
        digits remain after clean-up; otherwise a new captcha is requested.

        Side effects: writes ``picture.jpg`` (raw download), ``gpicture.jpg``
        (grayscale) and ``bpicture.jpg`` (binarized) to the working directory
        and prints the accepted code.

        Args:
            url: Entry address of the net-disk page.
                NOTE(review): currently unused — the warm-up request below
                goes to the hard-coded "http:" string, which looks like a
                redacted placeholder. Confirm whether ``url`` belongs there.
            max_attempts: Optional upper bound on the number of captcha
                downloads. ``None`` (default) retries until a 4-digit code
                is recognized.

        Returns:
            The recognized 4-digit code as a string.

        Raises:
            RuntimeError: if ``max_attempts`` is given and no attempt
                produced a 4-digit code.
        """
        image_name = 'picture.jpg'
        attempt = 0
        # Retry in a loop rather than by recursion, so a long run of OCR
        # failures cannot exhaust the call stack and the code is printed once.
        while max_attempts is None or attempt < max_attempts:
            attempt += 1

            # Warm-up request issued before the captcha is fetched; the shared
            # session keeps whatever cookies it sets.
            session.get("http:", headers=headers)
            captcha = session.get('http://page4.dfpan.com/verifyimg/getPcv.html', headers=headers)

            with open(image_name, 'wb') as file:
                file.write(captcha.content)

            # Convert to grayscale; the context manager closes the file handle.
            with Image.open(image_name) as image:
                imgry = image.convert('L')
            imgry.save('g' + image_name)

            # Binarize by thresholding with the precomputed lookup table.
            out = imgry.point(table, '1')
            out.save('b' + image_name)

            text = _clean_ocr_text(pytesseract.image_to_string(out))
            if len(text) == 4:
                print(text)
                return text

        raise RuntimeError(
            "no 4-digit verify code recognized after %d attempts" % attempt
        )
    
    def build_info(data):
        """Parse ``data`` as HTML and print the ``<table>`` elements it finds.

        Args:
            data: HTML source text of the downloaded page.

        Returns:
            None; the matching elements are only printed.
        """
        document = BeautifulSoup(data, "html.parser")
        # NOTE(review): a plain string as the second argument (``attrs``) is
        # matched by BeautifulSoup against the CSS class, so this selects
        # <table class="style">. If tables carrying a ``style`` attribute were
        # intended, the filter would need to be a dict — confirm.
        tables = document.find_all("table", "style")
        print(tables)
    
    def get_new_url(url):
        """Request ``url + <verify code> + '.html'`` and process the response.

        Obtains a captcha code via ``get_verify_code``, appends it and the
        ``.html`` suffix to ``url``, fetches the resulting address through the
        shared session and, on HTTP 200, passes the page text to
        ``build_info``. The code, the target address and the status code are
        printed along the way.

        Args:
            url: Address prefix the verify code is appended to.

        Returns:
            None.
        """
        code = get_verify_code(url)
        print(code)

        target = f"{url!s}{code!s}.html"
        print(target)

        page = session.get(target, headers=headers)
        print(page.status_code)

        # Any non-200 status (302 included) is reported with the same message.
        if page.status_code != 200:
            print("aleady new location")
            return

        build_info(page.text)
    
    if __name__ == "__main__":
        # Script entry point.
        # NOTE(review): the empty string looks like a placeholder for the real
        # page address prefix expected by get_new_url — confirm.
        get_new_url("")
  • 相关阅读:
    !clrstack未显示任何方法名
    !dumpheap参数和SOS帮助系统的一些一般信息
    WinDbg Script---显示RCW对象引用的COM对象
    为什么不能将2个调试器附加到1个进程
    什么是互操作调试(Interop-Debugging)?
    为什么托管调试与本机调试不同?
    在WinDBG中管理源代码窗口
    如何从转储文件确定 /LARGEADDRESSAWARE
    Dump文件数据存储格式(九)
    Windbg命令系列---!cpuid(CPU信息)
  • 原文地址:https://www.cnblogs.com/wangshaowei/p/8678105.html
Copyright © 2020-2023  润新知