Note: this only grabs the plain text of each post; everything else works the same way, so implement it yourself.
import time

import requests
from lxml import etree


class blog():

    def __init__(self, url):
        self.base_url = url
        # Pretend to be a browser so the site doesn't block the request
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}

    def get_html(self, url):
        # The pages are fetched with GET; headers must be passed as a keyword
        # argument (passed positionally they would be treated as query params)
        response = requests.get(url, headers=self.headers, timeout=5)
        if response.status_code == 200:  # 200 means the page was served normally
            response.encoding = response.apparent_encoding
            return response.text
        return None

    def get_url1(self, html):  # collect the article titles and URLs on one listing page
        x_html = etree.HTML(html)
        url_1 = x_html.xpath('//a[@class="postTitle2"]/@href')
        name = x_html.xpath('//a[@class="postTitle2"]/text()')
        names = [i.strip() for i in name]  # strip surrounding whitespace
        # Always return both lists (possibly empty) so the caller can unpack them
        return names, url_1

    def get_url2(self, s):
        # Listing pages follow the pattern <base_url>default.html?page=N
        url = self.base_url + "default.html?page=" + s
        return url

    def get_text(self, html):
        # Extract every text fragment from the post body
        x_html = etree.HTML(html)
        txt = x_html.xpath('//div[@id="cnblogs_post_body"]//p//text()')
        return txt

    def save_text(self, name, txt):  # save to a .txt file; txt is the article content
        print(name + " loading...")
        with open("C:/Users/25766/Desktop/sa/" + name + '.txt', 'w', encoding='utf-8') as f:  # output path
            for i in txt:
                f.write(i)
                f.write(' ')
        print("finish")


surl = input("Enter the home-page URL of the blog to crawl: ")
c = blog(surl)
for i in range(1, 200):  # walk the listing pages; the URL pattern was found by inspection
    url = c.get_url2(str(i))
    ls = c.get_html(url)
    time.sleep(2)  # pause between requests to be polite to the server
    if ls is None:  # skip listing pages that failed to load
        continue
    names, urls = c.get_url1(ls)
    for name, url in zip(names, urls):
        html = c.get_html(url)
        if html is None:
            continue
        txt = c.get_text(html)
        c.save_text(name[:5], txt)  # name the file after the first 5 characters of the title
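One caveat: article titles can contain characters that Windows forbids in file names (\ / : * ? " < > |), so save_text can still fail on some posts even after truncating to 5 characters. A minimal sanitizer sketch (the helper name safe_name is my own, not part of the original script):

import re

def safe_name(title, max_len=5):
    # Replace characters that are illegal in Windows file names with underscores
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title)
    return cleaned[:max_len]

With that in place, calling c.save_text(safe_name(name), txt) instead of c.save_text(name[:5], txt) would avoid an OSError on titles like "C++/Python notes".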