#coding=utf-8
"""Crawl meme images from fabiaoqing.com tag pages and save them to disk.

For every tag id the script walks the tag's paginated listing, collects the
lazily-loaded ``<img>`` elements and stores each image under ``SAVE_DIR``
using a sanitised version of the image title as the file name.
"""
import os
import re
from time import sleep

import requests
from bs4 import BeautifulSoup

# NOTE(review): Host, Referer and Cookie below refer to music.163.com, yet
# these headers are sent to fabiaoqing.com by get_img_list. Values are kept
# unchanged here; confirm that sending them to this site is intended.
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
           'Accept-Encoding': 'gzip, deflate',
           'Accept-Language': 'zh-CN,zh;q=0.9',
           'Connection': 'keep-alive',
           'Cookie': '_iuqxldmzr_=32; _ntes_nnid=0e6e1606eb78758c48c3fc823c6c57dd,1527314455632; '
                     '_ntes_nuid=0e6e1606eb78758c48c3fc823c6c57dd; __utmc=94650624; __utmz=94650624.1527314456.1.1.'
                     'utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); WM_TID=blBrSVohtue8%2B6VgDkxOkJ2G0VyAgyOY;'
                     ' JSESSIONID-WYYY=Du06y%5Csx0ddxxx8n6G6Dwk97Dhy2vuMzYDhQY8D%2BmW3vlbshKsMRxS%2BJYEnvCCh%5CKY'
                     'x2hJ5xhmAy8W%5CT%2BKqwjWnTDaOzhlQj19AuJwMttOIh5T%5C05uByqO%2FWM%2F1ZS9sqjslE2AC8YD7h7Tt0Shufi'
                     '2d077U9tlBepCx048eEImRkXDkr%3A1527321477141; __utma=94650624.1687343966.1527314456.1527314456'
                     '.1527319890.2; __utmb=94650624.3.10.1527319890',
           'Host': 'music.163.com',
           'Referer': 'http://music.163.com/',
           'Upgrade-Insecure-Requests': '1',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/66.0.3359.181 Safari/537.36'}

# Directory the downloaded images are written to.
SAVE_DIR = "d:/crawl1/"
# Inclusive range of tag ids to crawl.
FIRST_TAG_ID = 1
LAST_TAG_ID = 54673
# Upper bound on listing pages requested per tag.
MAX_PAGES_PER_TAG = 299
# File-name stems are cut to this many characters.
MAX_TITLE_LENGTH = 20
# Seconds to wait for a response; without it a stalled connection hangs forever.
REQUEST_TIMEOUT = 30

# Characters replaced in file names: / \ : * ? " < > | and space.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[/\\:*?"<>| ]')


def get_img_list(url):
    """Fetch *url* and return its ``<img class="ui image lazy">`` elements.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, 'lxml')
    return soup.find_all('img', class_='ui image lazy')


def validateTitle(title):
    """Return *title* turned into a file-name stem.

    Every character in ``/ \\ : * ? " < > |`` and every space is replaced by
    an underscore, then the result is truncated to ``MAX_TITLE_LENGTH``
    characters.
    """
    return _ILLEGAL_FILENAME_CHARS.sub("_", title)[:MAX_TITLE_LENGTH]


def _page_url(tag_id, page):
    """Return the listing URL for *page* (1-based) of tag *tag_id*."""
    if page == 1:
        # The first page has no "/page/N" component.
        return "https://fabiaoqing.com/tag/detail/id/" + tag_id + ".html"
    return "https://fabiaoqing.com/tag/detail/id/" + tag_id + "/page/" + str(page) + ".html"


def _save_image(img):
    """Download the image referenced by the ``<img>`` element and store it."""
    image_url = img.get('data-original')
    title = img.get('title')
    if image_url is None or title is None:
        print('skip image without data-original or title')
        return
    file_name = validateTitle(title) + os.path.splitext(image_url)[-1]
    # Download before opening the file so a failed request leaves no empty file.
    content = requests.get(image_url, timeout=REQUEST_TIMEOUT).content
    with open(os.path.join(SAVE_DIR, file_name), 'wb') as f:
        f.write(content)


def _crawl_tag(tag_id):
    """Walk the listing pages of one tag and save every image found."""
    for page in range(1, MAX_PAGES_PER_TAG + 1):
        child_url = _page_url(tag_id, page)
        print('crawl url ' + child_url)
        try:
            img_list = get_img_list(child_url)
        except requests.RequestException as e:
            # Give up on this tag only; the remaining tags are still crawled.
            print(str(e))
            return
        if not img_list:
            # A page without images is treated as the end of this tag's listing.
            return
        for img in img_list:
            try:
                _save_image(img)
            except (requests.RequestException, OSError) as e:
                # Best effort: one failed image must not stop the page.
                print(str(e))


def main():
    """Crawl every tag id from FIRST_TAG_ID to LAST_TAG_ID inclusive."""
    try:
        os.makedirs(SAVE_DIR, exist_ok=True)
        for tag_id in range(FIRST_TAG_ID, LAST_TAG_ID + 1):
            _crawl_tag(str(tag_id))
    except Exception as e:
        # Top-level boundary: report the error instead of a traceback.
        print(str(e))


if __name__ == "__main__":
    main()