• 爬取表情


    #coding=utf-8
    import os
    from time import sleep
    
    import requests
    import re
    from bs4 import BeautifulSoup
    
    # HTTP request headers copied verbatim from a browser session.
    # NOTE(review): Host/Referer/Cookie target music.163.com, but the crawl
    # below hits fabiaoqing.com — these look like stale leftovers from another
    # script; confirm whether fabiaoqing.com actually requires them.
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-CN,zh;q=0.9',
               'Connection': 'keep-alive',
               'Cookie': '_iuqxldmzr_=32; _ntes_nnid=0e6e1606eb78758c48c3fc823c6c57dd,1527314455632; '
                         '_ntes_nuid=0e6e1606eb78758c48c3fc823c6c57dd; __utmc=94650624; __utmz=94650624.1527314456.1.1.'
                         'utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); WM_TID=blBrSVohtue8%2B6VgDkxOkJ2G0VyAgyOY;'
                         ' JSESSIONID-WYYY=Du06y%5Csx0ddxxx8n6G6Dwk97Dhy2vuMzYDhQY8D%2BmW3vlbshKsMRxS%2BJYEnvCCh%5CKY'
                         'x2hJ5xhmAy8W%5CT%2BKqwjWnTDaOzhlQj19AuJwMttOIh5T%5C05uByqO%2FWM%2F1ZS9sqjslE2AC8YD7h7Tt0Shufi'
                         '2d077U9tlBepCx048eEImRkXDkr%3A1527321477141; __utma=94650624.1687343966.1527314456.1527314456'
                         '.1527319890.2; __utmb=94650624.3.10.1527319890',
               'Host': 'music.163.com',
               'Referer': 'http://music.163.com/',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/66.0.3359.181 Safari/537.36'}
    
    
    def get_img_list(url, timeout=30):
        """Fetch *url* and return every ``<img class="ui image lazy">`` tag.

        Parameters
        ----------
        url : str
            Page URL to fetch (headers from the module-level ``headers`` dict
            are sent with the request).
        timeout : float, optional
            Seconds before the HTTP request is aborted. ``requests.get`` has
            no default timeout, so without this the crawl could block forever
            on a stalled connection.

        Returns
        -------
        list
            Possibly-empty list of matching ``bs4`` tag objects.
        """
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'lxml')
        return soup.find_all('img', class_='ui image lazy')
    
    
    
    def validateTitle(title):
        """Sanitize *title* into a safe (Windows) filename fragment.

        Replaces every character that is illegal in Windows filenames
        (``\\ / : * ? " < > |``) and any line break with an underscore,
        then truncates the result to at most 20 characters.

        Parameters
        ----------
        title : str
            Raw image title scraped from the page.

        Returns
        -------
        str
            Sanitized title, at most 20 characters long.
        """
        # BUGFIX: the original raw string was broken by an unescaped double
        # quote and a literal newline (a syntax error). This is the character
        # class the author's own comment ('/ : * ? " < > |') intended.
        rstr = r'[\\/:*?"<>|\r\n]'
        new_title = re.sub(rstr, "_", title)  # replace forbidden chars with '_'
        return new_title[:20]
    
    
    # Main crawl loop: walk every tag-detail listing on fabiaoqing.com and
    # download each lazy-loaded emoticon image into ``path``.
    try:
        path = "d:/crawl1/"
        #_url = 'https://fabiaoqing.com/biaoqing/lists/page/{page}.html'
        _url = 'https://fabiaoqing.com/tag/detail/id/{page}.html'
        urls = [_url.format(page=page) for page in range(1, 54673 + 1)]
        for real_url in urls:
            # Page 1 of a tag is  .../tag/detail/id/<id>.html
            # later pages are     .../tag/detail/id/<id>/page/<n>.html
            tag_id = real_url.split('/')[-1].split('.')[-2]
            for i in range(1, 300):
                if i != 1:
                    child_url = ("https://fabiaoqing.com/tag/detail/id/" + tag_id
                                 + "/page/" + str(i) + ".html")
                else:
                    child_url = "https://fabiaoqing.com/tag/detail/id/" + tag_id + ".html"
                print('crawl url ' + child_url)

                for img in get_img_list(child_url):
                    try:
                        # 'data-original' holds the real image URL; 'src' is
                        # only the lazy-load placeholder.
                        image = img.get('data-original')
                        title = validateTitle(img.get('title'))
                        # Keep the remote file's extension on the local name.
                        filename = path + title + os.path.splitext(image)[-1]
                        with open(filename, 'wb') as f:
                            f.write(requests.get(image).content)
                    except Exception as e:
                        # Best-effort: one bad/missing image must not abort
                        # the whole crawl — log and continue.
                        print(str(e))
    except Exception as e:
        print(str(e))
  • 相关阅读:
    ubuntu命令
    mac获取root权限
    centos7安装解压缩工具 ncompress
    ubuntu17.04 配置go环境变量
    vue.js 拦截器
    ubuntu 安装jdk
    ubuntu安装deb文件
    初识 阿里云 SSL 证书申请
    java之XML
    LanProxy 内网映射穿透
  • 原文地址:https://www.cnblogs.com/brady-wang/p/12409280.html
Copyright © 2020-2023  润新知