目标:爬取 Vamei 的《Python 快速教程》
代码如下:
from bs4 import BeautifulSoup
import requests
def get_links(url='http://www.cnblogs.com/vamei/archive/2012/09/13/2682778.html',
              timeout=10):
    """Collect the links of all pages that are going to be crawled.

    Args:
        url: Address of the index page whose post body lists the links.
            Defaults to the index post this script was written for.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of href strings, in document order, taken from the anchors
        matched by ``#cnblogs_post_body > p > span > a``. Anchors that
        carry no ``href`` attribute are left out.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without an explicit timeout requests may wait forever on a stalled
    # connection.
    web_data = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty list.
    web_data.raise_for_status()
    soup = BeautifulSoup(web_data.text, 'lxml')
    anchors = soup.select('#cnblogs_post_body > p > span > a')
    # Tag.get returns None for a missing attribute; a None entry would
    # break the later requests.get(link) call, so drop those here.
    return [anchor['href'] for anchor in anchors if anchor.get('href')]
def get_content(timeout=10):
    """Fetch every linked page and print the text of its blog post body.

    The links come from ``get_links()``. For each page, the text of every
    element matched by ``#topics > div`` is printed to standard output.
    A page that cannot be downloaded is reported on standard error and
    skipped, so one bad link does not abort the whole crawl.

    Args:
        timeout: Seconds to wait for each page before giving up on it.

    Raises:
        requests.RequestException: If the index page itself cannot be
            fetched (propagated from ``get_links()``).
    """
    import sys  # local import: only needed for error reporting

    for link in get_links():
        if not link:
            # An anchor without an href yields None; nothing to fetch.
            continue
        try:
            # Without an explicit timeout a stalled server would hang
            # the crawl indefinitely.
            web_data = requests.get(link, timeout=timeout)
            web_data.raise_for_status()
        except requests.RequestException as err:
            # Keep diagnostics off stdout so they do not mix with content.
            print('skipping {}: {}'.format(link, err), file=sys.stderr)
            continue
        soup = BeautifulSoup(web_data.text, 'lxml')
        for content in soup.select('#topics > div'):
            print(content.get_text())
# Script entry point: crawl every linked page and print its post text.
get_content()
接下来可以考虑把爬取的内容保存到本地文件