试了一下爬取今日头条的组图。
首先是输入关键词后进入的索引页。使用 Chrome 的开发者工具可以看到这是一个 GET 请求,并且带有一些查询参数,因此这一步需要构造这个 GET 请求;请求成功则会返回一段 JSON 数据。
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of the Toutiao search index as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout is raised as a RequestException and handled below.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("请求索引页失败")
        return None
对于上一步返回的数据进行解析,取出需要的article_url字段
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in a search-index response.

    Args:
        html: JSON text of the index response, or ``None``/empty when the
            request failed.

    Yields:
        The ``article_url`` value of each entry under the top-level
        ``"data"`` key (``None`` for an entry that has no such field).
        Nothing is yielded when the input is empty, is not a JSON object,
        or its ``"data"`` value is missing or null.

    Raises:
        ValueError: If ``html`` is non-empty but not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # The fetch step returns None on failure; json.loads(None) would raise.
    if not html:
        return
    payload = json.loads(html)
    if not isinstance(payload, dict):
        return
    # "data" may be present but null, so fall back to an empty list.
    for item in payload.get('data') or []:
        if isinstance(item, dict):
            yield item.get('article_url')
对每个组图的url进行请求:
def get_detail_page(url, timeout=10):
    """Fetch the HTML of a single gallery detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout is raised as a RequestException and handled below.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("请求详情页面出错")
        return None
进行解析:
def _decode_gallery_json(raw):
    """Turn the body of the JS string passed to ``JSON.parse`` into Python data.

    Returns the decoded value, or ``None`` when it cannot be decoded.
    """
    try:
        # ``raw`` is the inside of a double-quoted JavaScript string literal.
        # Parsing it as a JSON string resolves the escapes (\" and \\/),
        # and the second pass parses the JSON document it contains.
        return json.loads(json.loads('"' + raw + '"'))
    except ValueError:
        pass
    try:
        # Fallback: drop runs of one or two backslashes and parse the rest.
        return json.loads(re.sub(r'\\{1,2}', '', raw))
    except ValueError:
        return None


def parse_detail_page(html, url):
    """Extract title and image URLs from a gallery page and download the images.

    Args:
        html: Page source of the detail page, or ``None``/empty when the
            request failed.
        url: Address of the detail page; copied into the result.

    Returns:
        ``{"title": ..., "url": ..., "images": [...]}`` when the page has a
        ``BASE_DATA.galleryInfo`` assignment whose ``gallery`` payload
        decodes to an object with a ``sub_images`` list. ``title`` is the
        raw text captured between ``title:`` and the next comma, or
        ``None`` if absent. Returns ``None`` for any page that does not
        match this shape.

    Side effects:
        Calls ``download_image`` once per extracted image URL.
    """
    if not html:
        return None

    info_match = re.search(r'BASE_DATA\.galleryInfo = (.*?);', html, re.S)
    if info_match is None:
        # Not a gallery page.
        return None
    info = info_match.group(1)

    title_match = re.search(r'title:(.*?),', info, re.S)
    title = title_match.group(1) if title_match else None

    # The parentheses of the JSON.parse(...) call must be escaped; left
    # unescaped they form a regex group and never match the literal call.
    gallery_match = re.search(r'gallery: JSON\.parse\("(.*?)"\)', info)
    if gallery_match is None:
        return None

    gallery = _decode_gallery_json(gallery_match.group(1))
    if not isinstance(gallery, dict):
        return None
    sub_images = gallery.get('sub_images')
    if not isinstance(sub_images, list):
        return None

    images = [item.get('url') for item in sub_images if isinstance(item, dict)]
    for image in images:
        download_image(image)
    return {"title": title, "url": url, "images": images}
将解析出的每组图片信息保存至MongoDB,并下载其中的图片:
def save_to_mongo(result):
    """Insert one parsed record into the ``MONGO_TABLE`` collection of ``db``.

    Args:
        result: The record to store.

    Returns:
        ``True`` (after printing a success message) when the insert call
        returns a truthy value, otherwise ``False``.
    """
    # NOTE(review): if `db` is a PyMongo database, `Collection.insert` was
    # removed in PyMongo 4 — confirm the driver version and consider
    # `insert_one`.
    if db[MONGO_TABLE].insert(result):
        print("存储到MongoDB成功")
        return True
    return False


def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        Always ``None``. The image is saved only on an HTTP 200 response;
        request failures are reported with a printed message.
    """
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout is raised as a RequestException and handled below.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print("下载图片出错")
        return None
将图片保存至本地:
def save_image(content):
    """Write image bytes to ``<cwd>/<md5-of-content>.jpg`` unless it already exists.

    Args:
        content: Raw bytes of the image.

    Returns:
        ``None``. Naming the file after the MD5 digest of its content means
        identical images map to the same path and are written only once.
    """
    digest = md5(content).hexdigest()
    target = os.getcwd() + '/' + digest + '.' + 'jpg'
    if os.path.exists(target):
        return
    # The context manager closes the file; no explicit close is needed.
    with open(target, 'wb') as out:
        out.write(content)
一部分保存下来的图片:
!!!!