• python爬虫伪装技术应用


    版权声明:本文为博主原创文章,转载请注明出处: https://blog.csdn.net/sc2079/article/details/82423865

    -写在前面


      本篇博客主要是爬虫伪装技术的应用,包括用户代理,IP代理,Cookies,适合爬虫初学者。文中如有不足,请指正。
      PS:网上资源整合,如有侵权,联系删除

    -环境配置安装


      运行环境:Python3.6、Spyder
      依赖的模块:urllib,requests,random等

    -正文


    1、用户代理

    def LoadUserAgents(uafile):
        """Load a shuffled pool of User-Agent values from a text file.

        The file is read in binary mode, so every returned entry is a
        ``bytes`` object. Each line is stripped of surrounding whitespace and
        then loses its first character and its last two characters.
        NOTE(review): that trimming presumably removes a wrapper around each
        entry (e.g. quotes plus a trailing separator) -- confirm against the
        actual format of the User-Agent file.

        Args:
            uafile: Path to the User-Agent list, one entry per line.

        Returns:
            list: The trimmed, non-empty entries (``bytes``) in random order.
        """
        uas = []
        with open(uafile, 'rb') as uaf:
            for line in uaf:
                entry = line.strip()[1:-2]
                # A raw line still carries its newline and is always truthy,
                # so emptiness must be checked on the trimmed entry; otherwise
                # blank lines would end up as empty User-Agent values.
                if entry:
                    uas.append(entry)
        random.shuffle(uas)
        return uas
    # Build the module-level User-Agent pool once; the request helpers below
    # draw a random entry from it for every request.
    uas = LoadUserAgents("user_agents.txt")
    

      下载user_agents.txt文档:链接,设置好路径后,运行即可。

    2、cookies

    # Pool of ready-made Cookie header strings; one is picked at random for
    # each request.
    # NOTE(review): these look like session cookies captured from a real site
    # and have presumably expired -- replace with fresh values before use.
    Cookies = ['_lxsdk_s=b458c64cc9ef92adce92c199a6b7%7C%7C4; __mta=214784936.1512623224439.1512623224439.1512623226558.2; client-id=8d050bd8-51b6-46fc-be96-cb6871d91fe7; ci=89; webloc_geo=31.778615%2C119.958931%2Cwgs84; _lxsdk=1602f5eb154c8-05c63a41825c168-1b451e24-13c680-1602f5eb154c8; uuid=73a65301-f351-4cdd-bfcd-35ae40aea13c',
               '_lxsdk_s=55618265813fc43043ce562acf29%7C%7C2; ci=89; __mta=252597174.1512623399814.1512623399814.1512623399814.1; _lxsdk=1602f6161c1c8-034ae66c3842d88-1b451e24-13c680-1602f6161c170; client-id=8ca2fe5f-ba96-41ae-b919-f8d3665e2f10; webloc_geo=31.778620%2C119.958935%2Cwgs84; uuid=ae73c608-8db3-4724-b4bf-91f4a8c701c5',
               '_lxsdk_s=4ffb12d5a90c2361745e6d0993a2%7C%7C2; ci=89; __mta=40613533.1512623465821.1512623465821.1512623465821.1; client-id=a31f1c79-3287-4ef1-8bd3-56bfb431a7da; webloc_geo=31.778616%2C119.958936%2Cwgs84; _lxsdk=1602f6262117d-0f173d7fde38d18-1b451e24-13c680-1602f626212c8; uuid=ba1b457e-ae34-4b56-bf7b-9bf3579ac0ae',
               '_lxsdk_s=4ba654e682eaf4d69efb90ae9cf1%7C%7C2; __mta=152351622.1512623525333.1512623525333.1512623525333.1; ci=89; client-id=e8101f58-a778-4332-81c2-75ba410b2bb3; webloc_geo=31.778611%2C119.958934%2Cwgs84; _lxsdk=1602f634c448d-0c74adf6d00e2e8-1b451e24-13c680-1602f634c45c8; uuid=e98a7e32-7511-4b40-ab3d-d88011a06471',
               '_lxsdk_s=f6511ca715103c33b7bb4aea1e9f%7C%7C2; __mta=40629917.1512623573270.1512623573270.1512623573270.1; ci=89; client-id=c69db65e-b5d1-4fe6-aa7a-340e3608aeb6; webloc_geo=31.778617%2C119.958930%2Cwgs84; _lxsdk=1602f64072fc8-04cf33034885b08-1b451e24-13c680-1602f640730c8; uuid=33b52a9f-14ea-4f80-9a97-8386fc9fb0a2',
               '_lxsdk_s=38327d7cfda78d0cfc4810838c32%7C%7C2; __mta=150795529.1512623625315.1512623625315.1512623625315.1; ci=89; client-id=221306b1-bfb7-4cfa-ab3f-2e4a85089864; webloc_geo=31.778610%2C119.958933%2Cwgs84; _lxsdk=1602f64d24531-0623a27320ba75-1b451e24-13c680-1602f64d246c8; uuid=85f7949c-284c-4ae6-b5f8-18aa1c6dc016',
               '_lxsdk_s=c2f111bde916cad4d653e07a35c9%7C%7C2; __mta=42950426.1512623702441.1512623702441.1512623702441.1; _lxsdk=1602f66000dc8-0bb60c750cf17b8-1b451e24-13c680-1602f66000d7f; ci=89; client-id=38cc3a86-fe57-491b-8564-c9b1a251cd4a; webloc_geo=31.778617%2C119.958931%2Cwgs84; uuid=095c6c87-a397-4cc4-a3fc-1c3e35eca3be',
               '_lxsdk_s=fd711080efee1adf6de57c1d4c88%7C%7C2; ci=89; webloc_geo=31.778612%2C119.958939%2Cwgs84; __mta=218070313.1512623752950.1512623752950.1512623752950.1; _lxsdk=1602f66c53e35-0b363d23b0e435-1b451e24-13c680-1602f66c53fc8; client-id=1a4a2af3-4201-4e61-bf26-92016709444b; uuid=9361e591-ada9-43c0-a762-b7b5559c649a',
               '_lxsdk_s=810677247f21b53cd75826e60cb8%7C%7C2; __mta=146594315.1512623869611.1512623869611.1512623869611.1; _lxsdk=1602f688ce80-074a1ec97ff043-1b451e24-13c680-1602f688ce9c8; ci=89; client-id=e60488fa-51fe-4d12-99a3-257a5a0853cc; webloc_geo=31.778609%2C119.958928%2Cwgs84; uuid=948eec52-7d10-46ce-b689-92140842f3f0',
               '_lxsdk_s=c44c26606c13c651ab5a20447428%7C%7C2; __mta=89516452.1512623906613.1512623906613.1512623906613.1; _lxsdk=1602f691da0c8-024ebcb15833bc-1b451e24-13c680-1602f691da04b; ci=89; client-id=a6846506-a50a-4612-8106-ff31978802a8; webloc_geo=31.778607%2C119.958929%2Cwgs84; uuid=6ee14a07-2d20-460e-87c2-001806dfdf1b',
               '_lxsdk_s=071dfe26ce64d6e46d03323d2f71%7C%7C2; __mta=210099498.1512623985254.1512623985254.1512623985254.1; ci=89; client-id=f2cea0fa-0d83-479a-a863-f2b7156fb392; webloc_geo=31.778621%2C119.958925%2Cwgs84; _lxsdk=1602f6a4f88c8-0ce1602d9c121a-1b451e24-13c680-1602f6a4f88c8; uuid=122a1bf8-f588-4b2d-a509-2bb45aaf1be9',
               '_lxsdk_s=833017bd50026c6c57b327f1f51a%7C%7C2; __mta=244702778.1512624031536.1512624031536.1512624031536.1; ci=89; client-id=ec7106e4-9dda-4ce0-aa8d-c12288812644; webloc_geo=31.778622%2C119.958921%2Cwgs84; _lxsdk=1602f6b0459c8-011b3f8b51c9328-1b451e24-13c680-1602f6b045ac8; uuid=04d6dbed-d679-4e9d-a0c7-4ea1b2343513',
               '_lxsdk_s=9a68aaba42beb1ebdfc3e726d94e%7C%7C2; ci=89; webloc_geo=31.778629%2C119.958912%2Cwgs84; __mta=45406747.1512624277952.1512624277952.1512624277952.1; _lxsdk=1602f6ec7b84-095c2ecf9438458-1b451e24-13c680-1602f6ec7b9c8; uuid=33acafd1-4a03-4f0f-9572-1ccbfd570efc; client-id=77353c1f-ba56-4ac9-bbb3-c7a689858182',
               '_lxsdk_s=c1508df449b0616ba37c5d9166f2%7C%7C2; __mta=216814121.1512624317347.1512624317347.1512624317347.1; ci=89; client-id=5d8e3ff2-5370-4c08-855e-9d26e5187040; webloc_geo=31.778633%2C119.958912%2Cwgs84; _lxsdk=1602f6f6187a2-029abec138ea508-1b451e24-13c680-1602f6f6188c8; uuid=1a808ef4-511d-44ba-acfd-d6339ddb92f8',
               '_lxsdk_s=7a249ecc9a048cbb826ea3b1f4e7%7C%7C2; __mta=256753588.1512624355644.1512624355644.1512624355644.1; _lxsdk=1602f6ff822c8-01f025c7119f968-1b451e24-13c680-1602f6ff823c8; ci=89; client-id=33339c7c-4dd0-4da8-8617-add389e72438; webloc_geo=31.778634%2C119.958912%2Cwgs84; uuid=92e531c6-4620-453e-ad8d-0e67644404ad',
               '_lxsdk_s=93f6847b5f37e552ae083beb48c2%7C%7C2; __mta=218397606.1512624389136.1512624389136.1512624389136.1; ci=89; client-id=21540b20-0240-48fa-ae1e-c1ce4dba892f; webloc_geo=31.778631%2C119.958925%2Cwgs84; _lxsdk=1602f707a3d3-0d7db668e6d9498-1b451e24-13c680-1602f707a3ec8; uuid=507d006a-6b37-4ff9-9299-88eae1bc0982',
               '_lxsdk_s=0ffdd4e2dddd57d3c05606eb82f3%7C%7C2; __mta=141367821.1512624428164.1512624428164.1512624428164.1; ci=89; client-id=d6e8de67-48b8-4262-b3b3-47851c46db5c; webloc_geo=31.778620%2C119.958977%2Cwgs84; _lxsdk=1602f7112b9c8-09d6e858772c1d8-1b451e24-13c680-1602f7112bac8; uuid=2b6ac2bf-62cf-4ebd-bb68-bd0c258c6b29',
               ]
    

    3、IP代理

    # Candidate proxy endpoints in "host:port" form; one is drawn at random
    # for each request. '116.199.115.78:80' is listed twice, so it is drawn
    # twice as often as the other entries.
    proxies = [
        '61.155.164.108:3128',
        '116.199.115.79:80',
        '42.245.252.35:80',
        '106.14.51.145:8118',
        '116.199.115.78:80',
        '123.147.165.143:8080',
        '58.62.86.216:9999',
        '202.201.3.121:3128',
        '119.29.201.134:808',
        '61.155.164.112:3128',
        '123.57.76.102:80',
        '116.199.115.78:80',
    ]
    

    4、urllib调用

    def ua(url):
        """Build a disguised request for ``url`` and install a proxied opener.

        A random User-Agent and a random Cookie header are attached to the
        request, and an opener routed through a randomly chosen proxy is
        installed as the global default used by ``urllib.request.urlopen``.

        Args:
            url: Address to request.

        Returns:
            urllib.request.Request: The request carrying the random headers.
        """
        req = urllib.request.Request(url)
        req.add_header("User-Agent", random.choice(uas))
        req.add_header("Cookie", random.choice(Cookies))
        # Register the proxy for both schemes: with only an "https" entry,
        # plain http:// URLs would be fetched directly, without the proxy.
        chosen = random.choice(proxies)
        proxy = urllib.request.ProxyHandler({"http": chosen, "https": chosen})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Install the opener globally so later urlopen() calls go through it.
        urllib.request.install_opener(opener)
        return req
    
    def get_data(url):
        """Open ``url`` with a disguised request and return the response.

        Args:
            url: Address to fetch.

        Returns:
            The response object returned by ``urllib.request.urlopen``, or
            ``None`` when the request fails (the error is printed).
        """
        req = ua(url)
        try:
            return urllib.request.urlopen(req)
        except Exception as err:
            # Best-effort fetch: report the failure and hand back None instead
            # of falling through to a return of a never-assigned local.
            print(err)
            return None
    
    # Fetch the page via urllib and decode the response body as UTF-8.
    # NOTE(review): `url` is not defined in the visible code -- presumably it
    # must be assigned before this point.
    response = get_data(url)
    contents = response.read().decode('utf-8')
    

    5、requests调用

    def get_data2(url):
        """Fetch ``url`` with requests using a random User-Agent, Cookie and proxy.

        Args:
            url: Address to fetch.

        Returns:
            The ``requests`` response, with its encoding set to the detected
            (apparent) encoding, or the string ``' '`` when the request fails
            or the server answers with an HTTP error status.
        """
        headers = {
            'User-Agent': random.choice(uas),
            'Cookie': random.choice(Cookies),
        }
        # The proxy must be passed through the `proxies` argument; placed in
        # the headers it is only sent to the server as a meaningless header
        # and the request is never actually proxied.
        proxy = 'http://' + random.choice(proxies)
        try:
            r = requests.get(url, timeout=15, headers=headers,
                             proxies={'http': proxy, 'https': proxy})
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r
        except requests.RequestException:
            # Keep the established failure sentinel for existing callers.
            return ' '
    
    # Fetch the page via requests and take the decoded body text.
    # NOTE(review): get_data2 returns the string ' ' on failure, which has no
    # `.text` attribute -- check the result before using it.
    response = get_data2(url)     
    content=response.text
    

    -总结

      对于我而言,requests比urllib相对更简洁好用一些。在爬虫实际应用中,如果远端网站有较强的反爬虫机制,当访问次数过多后,可能爬虫就不能再爬取数据了,此时应考虑更新用户代理、IP代理。

    写于2018/9/5/周三 14:46:22

  • 相关阅读:
    获取指定函数的函数名称(用于兼容IE)
    opa gatekeeper笔记:AdmissionReview input.request请求对象结构
    团队内部密码共享方案:KeePassXC+微盘(企业微信)
    一个简单的golang项目,实验 gitlab-ci-cd Pipelines
    调用企业微信API拨打紧急通知电话
    使用PAM模块实现普通用户之间su免密切换
    thin_check命令 man手册
    Nginx server_name翻译
    UDP端口检查告警SHELL脚本(企业微信版机器人版)
    从零搭建vsftpd
  • 原文地址:https://www.cnblogs.com/sc340/p/11870816.html
Copyright © 2020-2023  润新知