• Python Baidu image crawler


    # -*- coding:utf-8 -*-
    #https://blog.csdn.net/qq_32166627/article/details/60882964
    import requests
    import os
    import pinyin
    import simplejson
    
    def getManyPages(keyword, pages):
        params = []
        for i in range(0, 30 * pages, 30):  # 'pn' is the result offset; fetch `pages` pages of 30 results each
            params.append({
                          'tn': 'resultjson_com',
                          'ipn': 'rj',
                          'ct': 201326592,
                          'is': '',
                          'fp': 'result',
                          'queryWord': keyword,
                          'cl': 2,
                          'lm': -1,
                          'ie': 'utf-8',
                          'oe': 'utf-8',
                          'adpicid': '',
                          'st': -1,
                          'z': '',
                          'ic': 0,
                          'word': keyword,
                          's': '',
                          'se': '',
                          'tab': '',
                          'width': '',
                          'height': '',
                          'face': 0,
                          'istype': 2,
                          'qc': '',
                          'nc': 1,
                          'fr': '',
                          'pn': i,
                          'rn': 30,
                          'gsm': '1e',
                          '1488942260214': ''
                      })
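        # Each dict above is one query string for Baidu's acjson endpoint:
        # 'pn' is the result offset, 'rn' the page size, and 'word'/'queryWord'
        # carry the search term. The bare numeric key appears to be a
        # cache-busting timestamp copied from a browser request (an assumption).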
        url = 'https://image.baidu.com/search/acjson'
        urls = []
        for i in params:
            try:
                rgjson = requests.get(url, params=i).json().get('data')
            except simplejson.scanner.JSONDecodeError:
                print('[ERROR] simplejson.scanner.JSONDecodeError')
                continue
            urls.append(rgjson)

        return urls
    
    
    def getImg(dataList, localPath, keyword):

        if not os.path.exists(localPath):  # create the save folder if it is missing
            os.mkdir(localPath)

        x = 0
        for page in dataList:
            if not page:  # a page may be None when the response had no 'data' field
                continue
            for i in page:
                if i.get('thumbURL') is not None:
                    print("down " + str(x) + " image " + i.get('thumbURL'))
                    ir = requests.get(i.get('thumbURL'))
                    with open(localPath + "/" + keyword + '_%d.jpg' % x, 'wb') as f:
                        f.write(ir.content)
                    x += 1
                else:
                    print('image not exist')
    
    
    def convert():
        # Deduplicate the name column of stars_list.txt into stars_list_clean.txt
        fp = open("stars_list_clean.txt", 'w')
        with open("stars_list.txt", 'r') as face_file:
            stars_list = face_file.readlines()
            line_record = []
            for line in stars_list:
                line = line.replace('\n', '').replace('\r', '').replace('\t', '')
                line_split = line.strip().split(",")
                print(line_split[1])
                if line_split[1] not in line_record:
                    line_record.append(line_split[1])
                    fp.write('%s\n' % line_split[1])
                else:
                    print(line_split[1], " already exists")
        fp.close()
    
    def debug():

        # with open("stars_list_clean.txt", 'r') as face_file:
        #     stars_list = face_file.readlines()
        #     index = 0
        #     for line in stars_list:
        #         line = line.replace('\n', '').replace('\r', '').replace('\t', '')
        #         keyword_english = pinyin.get(line, format="strip")
        #         keyword = line
        #         index += 1
        #         if index > 0:
        #             break

        # print(keyword)
        # keyword1 = '胡因梦'
        # if keyword == keyword1:
        #     print("yes")
        # else:
        #     print("no")
        keyword = '胡因梦'
        keyword_english = "hym"
        dataList = getManyPages(keyword, 2)  # arg 1: keyword, arg 2: number of pages to fetch
        getImg(dataList, './hanxue', keyword_english)  # arg 2: save directory

        # keyword = '韩雪'
        # dataList = getManyPages(keyword, 2)
        # getImg(dataList, './hanxue')
    
    
    def run():

        # Convert the cleaned name list to pinyin, one name per line
        fp = open("stars_list_en.txt", 'w')
        with open("stars_list_clean.txt", 'r') as face_file:
            stars_list = face_file.readlines()
            for line in stars_list:
                line = line.replace('\n', '').replace('\r', '').replace('\t', '')
                keyword_english = pinyin.get(line, format="strip")
                fp.write('%s\n' % keyword_english)
        fp.close()
        face_ID_index = 0

        dir = "./stars_srcimg/"

        # if os.path.exists(dir):
        #     os.system("rm -rf " + dir)

        if not os.path.exists(dir):
            os.mkdir(dir)

        pages = 5
        maxnum = pages * 30
        print(maxnum)

        for line in stars_list:
            line = line.replace('\n', '').replace('\r', '').replace('\t', '')
            keyword = line
            print(keyword)
            keyword_english = pinyin.get(keyword, format="strip")
            print(keyword_english)
            face_ID = str(face_ID_index) + "_" + keyword
            facesavepath = dir + str(face_ID_index) + "_" + keyword
            face_ID_index += 1
            print(facesavepath)
            if not os.path.exists(facesavepath):
                os.mkdir(facesavepath)
            else:
                print(keyword, " already exists")
                continue

            print("down " + keyword)

            dataList = getManyPages(keyword, pages)  # arg 1: keyword, arg 2: number of pages to fetch
            getImg(dataList, facesavepath, face_ID)  # arg 2: save directory
    
    if __name__ == '__main__':
        debug()
        # run()
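
The script calls `requests.get` with no timeout and the default `python-requests` User-Agent, so a stalled connection hangs the whole crawl and Baidu may reject some requests outright. Below is a minimal hardened sketch of a download helper with a timeout, a browser-like User-Agent, and simple retries; the header string, retry count, and helper name `fetch_image` are my own assumptions, not part of the original script:

    # -*- coding:utf-8 -*-
    # Sketch only: the UA string and retry/timeout values are assumptions.
    import time
    import requests

    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    def fetch_image(url, retries=3, timeout=10):
        """Return the image bytes, or None after `retries` failed attempts."""
        for attempt in range(retries):
            try:
                r = requests.get(url, headers=HEADERS, timeout=timeout)
                r.raise_for_status()  # treat HTTP error codes as failures too
                return r.content
            except requests.RequestException as e:
                print('retry %d for %s: %s' % (attempt + 1, url, e))
                time.sleep(1)  # brief pause before the next attempt
        return None

In `getImg`, `ir = requests.get(i.get('thumbURL'))` would then become `content = fetch_image(i.get('thumbURL'))`, skipping the image when `None` comes back.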
• Original article: https://www.cnblogs.com/adong7639/p/9074012.html