At first I thought about using the IronPython library to run Python methods directly from C#, but importing packages kept throwing errors. So instead I'll build a small web service in Python and have C# call its API directly.
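As a rough sketch of what that web-service side could look like (this part is my assumption, not from the original post: it uses Flask, a hypothetical /api/categories route and port 5000, and reuses the Request and disfeilei helpers from the crawler code below), the C# side would then only need to send an HTTP GET and deserialize the JSON:

# minimal sketch, assuming Flask is installed; route name and port are illustrative
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/api/categories')
def categories():
    # fetch the blog sidebar and return the category list as JSON
    html = Request("https://www.cnblogs.com/ruogu/mvc/blog/sidecolumn.aspx", {'blogApp': 'ruogu'})
    return jsonify(disfeilei(html) if html != 0 else [])

if __name__ == '__main__':
    app.run(port=5000)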
Now on to crawling the cnblogs data.
Crawling cnblogs is easy, since everything is static data.
The approach:
1. Crawl the post categories and get their URLs
2. Crawl each category and get the list of article URLs
3. Crawl each article's details and download its images
4. Replace the image links in the article body
Here's the code:
import requests
import os
from pyquery import PyQuery as pq


def Request(url, data=""):
    herder = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
    req = requests.get(url, headers=herder, params=data)
    if req.status_code == 200:
        return req.text
    else:
        return 0


def disfeilei(html):
    # parse the sidebar and collect every category (url, name, post count)
    doc = pq(html)
    listfls = doc('#sidebar_postcategory').find('li a').items()
    list = []
    for fl in listfls:
        classfly = {
            'url': fl.attr('href'),
            'name': fl.text().split('(')[0],
            'count': fl.text().split('(')[1][0:-1]
        }
        list.append(classfly)
    return list


def Getwzcon(url):
    # fetch one article page, rewrite its image links and return the body HTML
    html = Request(url)
    doc = pq(html)
    con = doc('#main').find('#cnblogs_post_body')
    imglist = con.find("img").items()
    for i in imglist:
        url = i.attr('src')
        index = url.find('797834') + 7
        flit = url[index:]
        # save into the project folder
        path = 'h:/。net学习/blogs/BLOGS/WebApplication1/images/blogs/' + flit
        # dowimg(url, path)
        # replace the image path
        i.attr('src', path)
    print(type(con.html()))
    return con.html()


def dowimg(url, path):
    # get the directory part of the path
    paths = os.path.dirname(path)
    print(paths)
    # create the directory if it does not exist
    if os.path.exists(paths) == False:
        os.makedirs(paths)
    response = requests.get(url).content
    with open(path, 'wb') as f:
        f.write(response)
    print("File downloaded successfully")


def Getwenz(classfly):
    # crawl the article list of one category and build a dict for every article
    html = Request(classfly)
    doc = pq(html)
    listwzs = doc('#main').find('.entrylist>.entrylistItem').items()
    list = []
    for i in listwzs:
        title = i.find('.entrylistItemTitle').text()
        url = i.find('.entrylistItemTitle').attr('href')
        desc = i.find('.c_b_p_desc').text()[0:-4]
        entry = i.find('.entrylistItemPostDesc').text().split(" ")
        datatime = entry[2] + " " + entry[3]
        readcount = entry[5][3:-1]
        # fetch the article body
        content = Getwzcon(url)
        # print(entry)
        art = {
            'title': title,
            'url': url,
            'desc': desc,
            'datatime': datatime,
            'readcount': readcount,
            'body': content
        }
        print(art)
    return


if __name__ == '__main__':
    url = "https://www.cnblogs.com/ruogu/mvc/blog/sidecolumn.aspx"
    data1 = {'blogApp': 'ruogu'}
    textfeilei = Request(url, data1)
    if textfeilei != 0:
        # get all categories
        list_fly = disfeilei(textfeilei)
        # iterate over the categories
        for item in list_fly:
            # add to the database
            # print(item)
            # crawl the articles in this category
            Getwenz(item['url'])
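The main loop above only has a placeholder comment where each article should be written to the database. Below is a rough sketch of that step using the standard-library sqlite3 module; the blogs.db file name and the articles table schema are my assumptions, not something the post specifies:

import sqlite3

def save_art(art, db_path='blogs.db'):
    # hypothetical schema: one row per crawled article
    conn = sqlite3.connect(db_path)
    conn.execute('''CREATE TABLE IF NOT EXISTS articles
                    (title TEXT, url TEXT, "desc" TEXT,
                     datatime TEXT, readcount TEXT, body TEXT)''')
    conn.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?)',
                 (art['title'], art['url'], art['desc'],
                  art['datatime'], art['readcount'], art['body']))
    conn.commit()
    conn.close()

Calling save_art(art) inside the loop of Getwenz, right after the art dict is built, would persist each article as it is crawled.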