• requests ip代理池单ip和多ip设置方式


    requests库,在使用ip代理时,单ip代理和多ip代理的写法不同
    (目前测试通过,如有错误,请评论指正)

    • 单ip代理模式 
      (示例中省略了headers等请求参数)

      import requests
      # requests selects a proxy by the lower-case URL scheme of the request,
      # so the keys must be 'http' / 'https'. An upper-case 'HTTPS' key never
      # matches and the request would silently go out without the proxy.
      # The proxy address carries an explicit scheme, like the pool entries
      # used in the multi-proxy example.
      proxy = {
          'http': 'http://162.105.30.101:8080',
          'https': 'http://162.105.30.101:8080',
      }
      url = '爬取链接地址'
      response = requests.get(url,proxies=proxy)
      
    • 多ip代理模式
    import requests
    # random is used to pick one entry of the proxy pool for the request
    import random

    # Addresses of the proxies in the pool; each one serves both schemes.
    proxy_addresses = [
        'http://61.135.217.7:80',
        'http://118.114.77.47:8080',
        'http://112.114.31.177:808',
        'http://183.159.92.117:18118',
        'http://110.73.10.186:8123',
    ]
    # One proxies mapping per address, in the shape requests expects.
    proxy = [
        {'http': address, 'https': address}
        for address in proxy_addresses
    ]
    url = '爬取链接地址'
    chosen_proxy = random.choice(proxy)
    response = requests.get(url, proxies=chosen_proxy)
    

    简单的智联招聘爬虫封装

    import requests
    from bs4 import BeautifulSoup
    import re
    import ssl
    import time
    import random
    
    # Replace the ssl module's default HTTPS context factory with the
    # unverified one.
    # NOTE(review): this presumably disables certificate verification for
    # code that relies on that default (e.g. urllib) and may not affect the
    # requests calls below at all - confirm whether it is still needed.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # Pool of User-Agent header values; get_job_txt picks one at random for
    # every request it sends.
    user_agent = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]
    '''
    代理若出错,替换代理池,但代理池需要更新
    '''
    # proxy = [
    #   {
    #       'http': 'http://61.135.217.7:80',
    #       'https': 'http://61.135.217.7:80',
    #   },
    # {
    #       'http': 'http://118.114.77.47:8080',
    #       'https': 'http://118.114.77.47:8080',
    #   },
    # {
    #       'http': 'http://112.114.31.177:808',
    #       'https': 'http://112.114.31.177:808',
    #   },
    # {
    #       'http': 'http://183.159.92.117:18118',
    #       'https': 'http://183.159.92.117:18118',
    #   },
    # {
    #       'http': 'http://110.73.10.186:8123',
    #       'https': 'http://110.73.10.186:8123',
    #   },
    # ]
    
    def get_job_txt(city,kw,txt_name):
        """Scrape Zhaopin search results and append them to ``<txt_name>.txt``.

        Requests up to 100 result pages (``p`` = 0..99), sleeping 2 seconds
        before each request and sending a randomly chosen User-Agent header.
        Every result row is printed and appended to the output file as one
        tab-separated line: job title, company, salary, place, job link.
        Scraping stops at the first page that yields no result rows, after
        printing that page index.

        Args:
            city: value sent as the ``jl`` query parameter.
            kw: value sent as the ``kw`` query parameter.
            txt_name: output file name without the ``.txt`` extension; the
                file is opened in append mode, so earlier content is kept.

        Raises:
            IndexError: if a result row lacks one of the expected cells
                (``.zwmc``, ``.gsmc``, ``.zwyx``, ``.gzdd``).
        """
        out_path = '{0}.txt'.format(txt_name)
        for i in range(100):
            time.sleep(2)
            url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl={2}&kw={0}&sm=0&p={1}'.format(kw,i,city)

            response = requests.get(url,headers = {'User-Agent': random.choice(user_agent)}).content.decode()

            soup = BeautifulSoup(response,'lxml')
            # The first '.newlist' element is skipped - presumably the header
            # row of the result table; verify against the live page markup.
            tables = soup.select('.newlist')[1:]
            if not tables:
                print('总页'+ str(i))
                break

            # Open the output file once per page instead of once per row.
            with open(out_path,'a+',encoding='utf-8',errors='ignore') as f:
                for table in tables:
                    title_cell = table.select('.zwmc')[0]
                    job = title_cell.text
                    company = table.select('.gsmc')[0].text
                    money = table.select('.zwyx')[0].text
                    place = table.select('.gzdd')[0].text
                    href = title_cell.find('a')['href']
                    # Escape sequences keep the literal on one source line; a
                    # raw line break inside a quoted string is a SyntaxError.
                    line = job+'\t'+company+'\t'+money+'\t'+place+'\t'+href+'\n'
                    print(line)
                    f.write(line)
    
    
    if __name__ == '__main__':
        # Ask for the three inputs in order (city, job keyword, output file
        # name) and hand them straight to the scraper.
        city, kw, txt_name = (
            input('输入城市'),
            input('输入岗位'),
            input('输入储存文件名'),
        )
        get_job_txt(city, kw, txt_name)

    转自https://blog.csdn.net/weixin_35993084/article/details/80770157
  • 相关阅读:
    Python基础——内置函数
    HHKB Mac快捷键使用
    解决EditText在ListView Item中,第一次点击无法获取焦点问题
    Android 设置Spinner默认显示文字
    IOS-静态Cell
    android-创建流式布局,并修改最后一行的最后一个view
    最简单的设置ExitText只能输入数字和字母的方法
    Android studio私人常用快捷键(持续更新)
    IOS-NSNotification(通知)
    Android书单(持续更新)
  • 原文地址:https://www.cnblogs.com/z-x-y/p/9355223.html
Copyright © 2020-2023  润新知