SBR (Steel Ball Run) is my favorite part of the JOJO series, so today I scraped the manga to my local machine to read at leisure later.
import os
import requests
from requests import codes
from requests import RequestException
from bs4 import BeautifulSoup


def get_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                                 '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def get_pagesNumber(text):
    # the chapter's total page count is stored as a data attribute on a hidden <div>
    soup = BeautifulSoup(text, 'lxml')
    pagesNumber = soup.find(name='div', class_="d-none vg-r-data")
    return pagesNumber.attrs['data-total']


def parse_page(text):
    soup = BeautifulSoup(text, 'lxml')
    url = soup.find(name='img', class_="img-fluid show-pic")
    chapter = soup.find(name='h2', class_="h4 text-center")
    page = soup.find(name='span', class_="c_nav_page")
    yield {
        'url': url['src'],
        'chapter': chapter.get_text(),
        'page': page.get_text()
    }
    # return would end the function as soon as it hands back a result;
    # yield instead turns the function into a generator: it produces one value,
    # is frozen, and produces the next value when it is resumed


def save_image(item):
    img_path = 'SBR' + os.path.sep + item.get('chapter')  # os.path.sep is the OS path separator
    if not os.path.exists(img_path):
        os.makedirs(img_path)
    try:
        resp = requests.get(item.get('url'))
        if codes.ok == resp.status_code:
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=item.get('page'), file_suffix='jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image path is %s' % file_path)
            else:
                print('Already Downloaded', file_path)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    for chapter in range(292, 316):  # 24 chapters in total, 292 to 315 (color edition: 13283 to 13306)
        url = 'https://www.manhuadb.com/manhua/147/4_' + str(chapter) + '.html'
        text = get_page(url)
        pagesNumber = get_pagesNumber(text)  # total number of pages in the current chapter
        for page in range(1, int(pagesNumber) + 1):
            url = 'https://www.manhuadb.com/manhua/147/4_' + str(chapter) + '_' + str(page) + '.html'
            # color edition: url = 'https://www.manhuadb.com/manhua/147/1330_' + str(chapter) + '_' + str(page) + '.html'
            text = get_page(url)
            for item in parse_page(text):
                save_image(item)
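Since parse_page uses yield, the generator comment above is worth a quick illustration. Below is a minimal toy sketch; the pages function and its values are made up purely for illustration and are not part of the crawler:

def pages():
    # a generator: each value is produced lazily; the function freezes at the
    # yield and resumes from that point on the next iteration
    for n in range(1, 4):
        yield {'page': n}

for item in pages():
    print(item)  # prints {'page': 1}, then {'page': 2}, then {'page': 3}

A plain return would end the function after the first result; yield is what lets the main loop write for item in parse_page(text) and handle each image dict as it is produced.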
In the end we get: