• 爬虫-爬取美少女壁纸


    1. 有时候想找些好看的壁纸,一个一个下载太慢了。作为一个菜鸡程序员,还是会点爬虫的;说到爬虫,当然还是 Python 香,说干就干。

    import requests
    import  re,os
    from lxml import etree
    import threading
    
    
    def getUrl(i, timeout=30):
        """Collect the wallpaper albums listed on one search-result page.

        Args:
            i: Page number of the "动漫" search results; it is inserted
                into the page URL as ``<i>.html``.
            timeout: Seconds to wait for the HTTP response before
                ``requests`` raises a timeout error instead of blocking
                the calling thread indefinitely.

        Returns:
            A list of ``[album_url, album_name]`` pairs, one for every list
            item (positions 1-18) that has both a link and a title. The
            list is empty when the response cannot be parsed.
        """
        page_url = "https://www.bizhizu.cn/search/动漫/" + str(i) + ".html"
        page_res = requests.get(page_url, timeout=timeout).text
        html = etree.HTML(page_res)
        url_list = []
        if html is None:
            # The parser can yield None when the response has no usable
            # markup; report an empty page rather than crash on .xpath().
            print(url_list)
            return url_list
        # A distinct index name keeps the page number ``i`` intact.
        for idx in range(1, 19):
            item = '//*[@class="imgcont"]/ul/li[' + str(idx) + ']'
            hrefs = html.xpath(item + '/a[1]/@href')
            names = html.xpath(item + '/a[2]/text()')
            # Skip list items that lack either the link or the title.
            if hrefs and names:
                url_list.append([hrefs[0], names[0]])
        print(url_list)
        return url_list
    
    
    def saveImg(url, path_name, save_root="/Volumes/HD2/downloadpic/", timeout=30):
        """Download the full-size images of one wallpaper album.

        The album page is fetched, up to nine thumbnails are read from the
        ``#thumb`` list, and for each thumbnail the matching ``.source.jpg``
        image is saved into ``<save_root>/<album name>/``.

        Args:
            url: Address of the album page.
            path_name: Album title; used for the keyword filter and, with
                "/" replaced by "-", as the directory name.
            save_root: Directory under which the album directory is created.
            timeout: Seconds to wait for each HTTP response.

        Returns:
            None. Images are written to disk and progress is printed.
        """
        # NOTE(review): the first three keywords are empty strings, and ""
        # is a substring of every string, so this filter currently accepts
        # every album. Put real keywords here to make it selective.
        keywords = ("", "", "", "少女")
        if not any(word in path_name for word in keywords):
            return
        # Fetch the page only after the album passed the filter.
        res = requests.get(url, timeout=timeout).text
        html = etree.HTML(res)
        if html is None:
            return
        # "/" in the title would otherwise be treated as a path separator.
        album_dir = os.path.join(save_root, path_name.replace("/", "-"))
        # makedirs(exist_ok=True) creates missing parents and does not fail
        # when another thread creates the same directory first.
        os.makedirs(album_dir, exist_ok=True)
        # Group 1: image URL up to ".jpg"; group 2: file name without the
        # extension. Dots are escaped so they only match literal dots.
        thumb_re = re.compile(r"(https://.*?/([^/]*?)\.jpg)\.220\.146\.jpg")
        for idx in range(1, 10):
            thumbs = html.xpath('//*[@id="thumb"]/li[' + str(idx) + ']/a/img/@src')
            if not thumbs:
                continue
            match = thumb_re.search(thumbs[0])
            if match is None:
                continue
            base_url, stem = match.group(1), match.group(2)
            source_url = base_url + ".source.jpg"
            img = requests.get(source_url, timeout=timeout)
            if not img.ok:
                # Do not store an HTTP error page under a .jpg name.
                print("下载失败:", img.status_code, source_url)
                continue
            # "wb" overwrites; append mode would corrupt the image when the
            # script is run again for the same album.
            with open(os.path.join(album_dir, stem + ".jpg"), "wb") as f:
                f.write(img.content)
            print("保存成功:", path_name + '/' + stem + ".jpg")
    def demo1():
        """Walk search-result pages 1 through 17 and hand each album to saveImg."""
        for page in range(1, 18):
            for album in getUrl(page):
                link, title = album[0], album[1]
                saveImg(link, title)
    
    def demo2():
        """Walk search-result pages 18 through 36 and hand each album to saveImg."""
        first_page, stop_page = 18, 37
        page = first_page
        while page < stop_page:
            albums = getUrl(page)
            for entry in albums:
                saveImg(entry[0], entry[1])
            page += 1
    
    if __name__=='__main__':
        # Start two threads, each crawling its own range of result pages.
        t1 = threading.Thread(target=demo1)
        t2 = threading.Thread(target=demo2)
        t1.start()
        t2.start()
  • 相关阅读:
    livereload
    XAMPP Apache 配置多端口和多域名方法
    移动端touch事件实现页面弹动--小插件
    QRCode生成二维码,jq QRCode生成二维码,QRCode生成电子名片
    下拉刷新
    js 中获取 日期
    wcf 的 知识点
    iis 发布失败原因总结
    list 用法的随手记
    字符编码的 文章推荐
  • 原文地址:https://www.cnblogs.com/s42-/p/12983525.html
Copyright © 2020-2023  润新知