# coding=utf-8
"""Crawl Toutiao "街拍" (street-snap) image galleries.

Stage 1 (currently commented out in ``__main__``): query the Toutiao
search API with a multiprocessing pool, extract gallery titles and image
URLs, and upsert them into MongoDB (db 'jiepai', collection 'jiepai').

Stage 2: read the saved records back from MongoDB and download every
image in parallel, one worker process per gallery record.
"""
import json
import os
import string
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import requests
from requests.exceptions import RequestException, Timeout
from bs4 import BeautifulSoup  # noqa: F401 -- imported by the original file; kept for compatibility
from pymongo import MongoClient


def get_response(url):
    """GET *url* with a desktop UA; return the Response, or None on timeout."""
    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/62.0.3202.62 Safari/537.36"
        }
        # proxies = {'http': '118.11.2.3:8080'}
        response = requests.get(url, headers=headers, timeout=5)
        print(url + 'request success')
        return response
    except Timeout:
        print(url + 'request timeout')
        return None


def get_page_index(offset, keyword):
    """Fetch one page of search results.

    Returns the raw response body (JSON text) on HTTP 200, else None.
    """
    data = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab",
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(data)
    print(url)
    try:
        response = get_response(url)
        # get_response returns None on timeout; the original dereferenced it
        # unconditionally and crashed with AttributeError instead of
        # reaching the RequestException handler.
        if response is None:
            return None
        print(response.status_code)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('request error')
        return None


def conn_mongodb():
    """Return the 'jiepai' collection of the local 'jiepai' database."""
    client = MongoClient('localhost', 27017)
    db = client['jiepai']
    return db['jiepai']


def save_image_url(data):
    """Upsert one gallery record into MongoDB, keyed by its title."""
    jiepai = conn_mongodb()
    # Collection.update() is deprecated/removed in modern PyMongo;
    # update_one() with upsert=True is the equivalent call.
    jiepai.update_one({'title': data.get('title')}, {'$set': data}, upsert=True)


def get_image_url():
    """Return a cursor over saved records shaped {'title', 'images_list'}."""
    jiepai = conn_mongodb()
    return jiepai.find({}, {'title': 1, 'images_list': 1, '_id': 0})


def _safe_dir_name(title):
    """Strip punctuation so the gallery title can be used as a directory name."""
    file_name = str(title).strip(string.punctuation)
    return file_name.replace('?', '')


def download_image(item):
    """Download every image of ONE gallery record into jiepai/<title>/.

    *item* is a single record dict ({'title', 'images_list'}) as produced
    by get_image_url() -- pool.map hands this function one record per
    call.  The original version iterated the dict as if it were the whole
    record list, built paths without separators, and contained ``'\\'``
    escapes that broke its string literals outright.
    """
    base_dir = os.path.abspath(os.path.dirname(__file__))
    root = os.path.join(base_dir, 'jiepai')
    if not os.path.exists(root):
        os.mkdir(root)
    title = item.get('title')
    images_list = item.get('images_list') or []
    print(title)
    print('images_list', images_list)
    # Per-gallery directory named after the sanitised title.
    file_path = os.path.join(root, _safe_dir_name(title))
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    for image_url in images_list:
        print(image_url)
        response = get_response(image_url)
        if response is None:
            # Timed out -- skip this image rather than crash the worker.
            continue
        content = response.content
        # md5 of the bytes gives a stable name and de-duplicates images.
        image_name = md5(content).hexdigest() + '.jpg'
        with open(os.path.join(file_path, image_name), 'wb') as f:
            f.write(content)
        print('download success')


def parse_page_index(html):
    """Parse one search-result JSON page and save each gallery to MongoDB."""
    if not html:
        # get_page_index may return None; json.loads(None) would raise.
        return
    data = json.loads(html)
    if data and 'data' in data:
        for item in data.get('data'):
            title = item.get('title')
            if title is None:
                continue
            record = {'title': title}
            images = item.get('image_detail')
            images_list = [image.get('url') for image in images] if images else []
            # set() removes duplicate URLs within the gallery.
            record['images_list'] = list(set(images_list))
            print(record)
            save_image_url(record)


def main(offset):
    """Crawl one results page at *offset* and persist its galleries."""
    html = get_page_index(offset, '街拍')
    parse_page_index(html)


if __name__ == "__main__":
    # Stage 1: crawl image links with multiple processes, save to MongoDB.
    # groups = [x * 20 for x in range(0, 5)]
    # pool = Pool()
    # pool.map(main, groups)

    # Stage 2: fetch links from MongoDB and download images in parallel.
    # Materialise the cursor BEFORE handing it to the pool -- the original
    # built `datas` and then mapped over the already-exhausted cursor.
    datas = [item for item in get_image_url()]
    pool = Pool()
    pool.map(download_image, datas)