Use the third-party Python libraries requests (HTTP client) and lxml (HTML parser) to crawl all images from any Baidu Tieba forum, given the forum name:
The requirements are as follows:
1. Programming paradigm: object-oriented.
2. Apply simple counter-anti-crawling measures: do not send requests too frequently, and hide the crawling tool by randomly generating the User-Agent request header to evade detection (a minimal sketch of both measures follows this list).
3. Crawl only the images posted by the thread starter (the first floor); do not crawl any other images.
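Requirement 2 boils down to two habits: rotating a believable User-Agent on every request and pacing requests with a randomized delay. Below is a minimal, self-contained sketch of both, using the fake_useragent package; the function name polite_get, the timeout, and the delay range are illustrative choices, not part of the original code.

import time
import random

import requests
from fake_useragent import UserAgent

ua = UserAgent()

def polite_get(url, **kwargs):
    # A fresh random browser User-Agent for every request, so the
    # default 'python-requests/x.y' signature never leaks
    headers = {'User-Agent': ua.random}
    resp = requests.get(url, headers=headers, timeout=10, **kwargs)
    # Randomized pause so the request rhythm is not machine-regular
    time.sleep(random.uniform(1, 5))
    return resp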
The code is as follows:
import requests
from lxml import etree
import os
import time
import random
import warnings
from fake_useragent import UserAgent

warnings.filterwarnings('ignore')


class BaiduSpider(object):
    def __init__(self, keyword, page_number):
        self.url = 'http://tieba.baidu.com/'
        self.useragent = UserAgent()
        self.headers = {'User-Agent': self.useragent.random}
        self.keyword = keyword
        self.page_number = page_number

    # Fetch one forum index page and collect the thread links on it
    def get_tlink(self, data):
        # Refresh the random User-Agent on every request (requirement 2)
        self.headers = {'User-Agent': self.useragent.random}
        res = requests.get(self.url, headers=self.headers, params=data)
        res.encoding = 'utf-8'
        html = res.text
        # Tieba ships the thread list inside HTML comments; strip the
        # comment markers so lxml can see and parse the list
        html = html.replace(r"<!--", '').replace(r"-->", '')
        parse_html = etree.HTML(html)
        t_list = parse_html.xpath(
            '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div//a/@href')
        for t in t_list:
            # Build the absolute URL of each thread
            t_link = 'http://tieba.baidu.com' + t
            # Request the thread page, extract its image links, and save the images
            self.get_ilink(t_link)

    # Extract the image links from the first floor of a thread
    def get_ilink(self, t_link):
        self.headers = {'User-Agent': self.useragent.random}
        res = requests.get(t_link, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        # Only the first-floor container is matched, so only the
        # thread starter's images are collected (requirement 3)
        i_list = parse_html.xpath(
            '//div[@class="d_post_content_main d_post_content_firstfloor"]'
            '//div[@class="d_post_content j_d_post_content clearfix"]'
            '/img[@class="BDE_Image"]/@src')
        print(i_list)
        for i in i_list:
            image = requests.get(i, headers=self.headers).content
            self.write_image(image, i)

    # Save one image into the per-forum directory
    def write_image(self, image, i):
        filename = './' + self.keyword + '/' + i[-10:]
        with open(filename, 'wb') as f:
            f.write(image)

    def main(self):
        # Create the output directory if it does not exist yet
        if not os.path.exists(self.keyword):
            os.makedirs(self.keyword)
        for i in range(1, self.page_number + 1):
            data = {
                'kw': self.keyword,
                'pn': str((i - 1) * 50)
            }
            self.get_tlink(data)
            print('Page %d downloaded' % i)
            # Random pause between index pages to keep the request rate low
            time.sleep(random.randint(1, 10))


if __name__ == "__main__":
    spider = BaiduSpider('高考吧', 1)
    spider.main()
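Two design choices are worth noting. First, Tieba delivers the thread list wrapped in HTML comments (it is meant to be revealed by JavaScript on the client), which is why get_tlink strips the <!-- and --> markers before parsing; without that step the thread-list XPath matches nothing. Second, the XPath in get_ilink is anchored on the d_post_content_firstfloor container, so only first-floor images ever reach write_image, which is how requirement 3 is enforced. To crawl a different forum, change the constructor arguments; for example (the forum name here is purely illustrative):

spider = BaiduSpider('python吧', 2)   # forum name, number of index pages (50 threads per page)
spider.main()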