Scraping Sogou image galleries: there are many categories, but each one holds relatively little data.
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib import parse

import requests
from pymongo import MongoClient


class Save:
    def __init__(self, host):
        self.client = MongoClient(host=host, port=27017)
        self.db = self.client.ImageSet

    def _save_data_mongodb(self, collect_name, data):
        self.collect_name = self.db[collect_name]
        history_record = self.collect_name.find_one({"_id": data['id']})
        if history_record:
            # Record already in MongoDB: tell the caller to skip re-downloading
            return False
        self.collect_name.update_one({'_id': data['id']}, {'$set': data}, upsert=True)
        return True


class SouHu:
    def __init__(self, category):
        self.category_name = category
        self.category = parse.quote(category)
        # Detail endpoint: returns the image URLs of one gallery (d = comma-separated doc ids)
        self.image_url_temp = ("https://pic.sogou.com/pics/imageddetail2013.jsp?k="
                               + self.category + "&tc=&t=&id=0&d={}")
        # List endpoint: returns one page of galleries for the category (start = page offset)
        self.start_url = ("https://pic.sogou.com/pics?query=" + self.category
                          + "&mode=8&dm=11&leftp=44230502&cwidth=1024&cheight=768"
                          + "&st=0&start={}&reqType=ajax&reqFrom=result&tn=0")
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }

    def get_title_id_grpdocs(self, url):
        """Fetch one result page and extract each gallery's id, title and detail URL."""
        print(url)
        response = requests.get(url=url, headers=self.headers)
        json_response = json.loads(response.text)
        items = json_response.get('items')
        image_content = []
        for i in items:
            item = {}
            try:
                doc_id = i['mf_id']
                title = i['title']
                grpdocs = i['grpdocs']
                # grpdocs is a whitespace-separated id list; the detail endpoint wants commas
                grpdocs = re.sub(r'\s+', ',', grpdocs)
                item['id'] = doc_id
                item['title'] = title
                item['ImageUrl'] = self.image_url_temp.format(grpdocs)
                image_content.append(item)
            except Exception as e:
                print(e)
                continue
        print(image_content)
        return image_content

    def get_save_content(self, image_content):
        """Fetch each gallery's detail JSON and collect its image URLs."""
        save_content = []
        for image in image_content:
            item = {}
            response = requests.get(url=image['ImageUrl'], headers=self.headers)
            json_response = json.loads(response.text)
            image_list = []
            for i in json_response:
                # Prefer pic_url, fall back to ori_pic_url, skip entries with neither
                image_url = i.get('pic_url') or i.get('ori_pic_url')
                if image_url:
                    image_list.append(image_url)
            item['id'] = image['id']
            item['title'] = image['title']
            item['url'] = image_list
            save_content.append(item)
        print(save_content)
        return save_content

    def save_(self, save_content):
        upload_time = time.strftime("%Y-%m-%d", time.localtime())
        print("开始写入")  # start writing
        for i in save_content:
            if len(i['url']) < 3:
                continue  # skip galleries with fewer than 3 images
            collect_name = "搜狗图片"
            result = Save("localhost")._save_data_mongodb(collect_name, data=i)
            if not result:
                continue  # already recorded in MongoDB
            try:
                # Strip whitespace and non-word characters so the title is a valid folder name
                title = re.sub(r'\s+', '', i['title'])
                title = re.sub(r'\W+', '', title)
                path = 'D:/搜狗/' + self.category_name + '/' + str(upload_time) + '/' + title
            except Exception as e:
                print(e)
                continue
            if os.path.exists(path):
                continue
            os.makedirs(path)
            try:
                with open(path + '/content.txt', 'w', encoding='utf8') as fb:
                    fb.write(str([i['title']]))
                # enumerate instead of list.index(): index() returns the first match,
                # so duplicate URLs would keep overwriting the same file
                for a, s in enumerate(i['url']):
                    with open(path + '/{}.jpg'.format(a), 'wb') as f:
                        print(s)
                        response = requests.get(url=s, headers=self.headers)
                        f.write(response.content)
            except Exception as e:
                print(e)
                continue
            print(title + " 写入完成")  # finished writing this gallery
        
    def run(self, num):
        url = self.start_url.format(num)
        image_content = self.get_title_id_grpdocs(url)
        save_content = self.get_save_content(image_content)
        self.save_(save_content)


if __name__ == '__main__':
    category = input("输入分类名称:")  # prompt: enter a category name
    with ThreadPoolExecutor(10) as executor:
        sh = SouHu(category)
        for num in range(2400):
            executor.submit(sh.run, num)
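Once a few pages have been scraped, the stored records can be read back directly from MongoDB. Below is a minimal sketch, assuming the same localhost:27017 instance, ImageSet database, and 搜狗图片 collection used above; the field names (_id, title, url) are exactly what save_ writes:

from pymongo import MongoClient

# Connect to the same MongoDB instance the scraper writes to (assumption: localhost:27017)
client = MongoClient(host="localhost", port=27017)
collection = client.ImageSet["搜狗图片"]

# Each document was upserted by Save._save_data_mongodb with _id, title and url (a list)
print("galleries stored:", collection.count_documents({}))
for doc in collection.find().limit(5):
    print(doc['_id'], doc['title'], len(doc['url']), "images")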