import os
"""
1.先获取需要爬取的url[生成器]
2.找到详情页的url
3.找到对应的标题
4.保存
"""
from requests_html import HTMLSession
class Spider:
    """Crawl the xiaohuar.com "meinv" listing pages and save each image.

    ``get_url`` yields the listing-page URLs, ``get_img_info`` extracts
    ``(image_url, title)`` pairs from one listing page, and ``save`` writes
    one image to disk. ``run`` chains the three steps together.
    """

    # Site root; prefixed to image paths that are not already absolute.
    BASE_URL = "http://www.xiaohuar.com"
    # Directory (relative to the working directory) that receives the images.
    SAVE_DIR = "好看"
    # Deletes the characters that must not appear in a file name component.
    _TITLE_STRIP_TABLE = str.maketrans("", "", "\\/|")

    def __init__(self):
        # A single session is shared by every request this spider makes.
        self.session = HTMLSession()

    def get_url(self):
        """Yield the listing-page URLs to crawl (pages 1 through 3).

        Page 1 uses the bare directory URL; every later page uses
        ``index_<n>.html``. Exactly one URL is produced per page.
        """
        for page in range(1, 4):
            if page == 1:
                yield f"{self.BASE_URL}/meinv/"
            else:
                yield f"{self.BASE_URL}/meinv/index_{page}.html"

    def get_img_info(self, index_url):
        """Yield ``(image_url, title)`` for each item on a listing page.

        Args:
            index_url: URL of one listing page, as produced by ``get_url``.

        Yields:
            Tuples of the absolute image URL and the item's title text.
            Items without an ``<img>`` carrying a ``src`` attribute, or
            without a ``.p_title`` element, are skipped.
        """
        response = self.session.get(url=index_url)
        for item in response.html.find('.items'):
            img = item.find('img', first=True)
            title_node = item.find('.p_title', first=True)
            # find(..., first=True) gives None when nothing matches.
            if img is None or title_node is None:
                continue
            src_url = img.attrs.get("src")
            if not src_url:
                continue
            if not src_url.startswith("http"):
                src_url = self.BASE_URL + src_url
            yield src_url, title_node.text

    @staticmethod
    def _clean_title(title_name):
        """Return *title_name* with ``\\``, ``/`` and ``|`` removed."""
        return title_name.translate(Spider._TITLE_STRIP_TABLE)

    def save(self, src_url, title_name):
        """Download one image and write it to ``SAVE_DIR/<title>.jpg``.

        Args:
            src_url: Absolute URL of the image to download.
            title_name: Item title; used as the file name after the
                characters ``\\``, ``/`` and ``|`` have been removed.
        """
        response = self.session.get(url=src_url)
        title = self._clean_title(title_name)
        # The target directory may not exist yet on a first run.
        os.makedirs(self.SAVE_DIR, exist_ok=True)
        path = os.path.join(self.SAVE_DIR, title + ".jpg")
        with open(path, 'wb') as f:
            f.write(response.content)
        print(f'{title}下载完成')

    def run(self):
        """Crawl every listing page and save each image found on it."""
        for index_url in self.get_url():
            for src_url, title_name in self.get_img_info(index_url):
                self.save(src_url, title_name)
if __name__ == "__main__":
    # Script entry point: build a spider and run the full crawl.
    Spider().run()