这是一个使用 requests 和 BeautifulSoup 库编写的小爬虫,用于爬取指定网页中的所有超链接(只抓取单个页面,不会递归遍历整个网站)。有任何问题欢迎留言讨论。
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Download a web page and return its HTML as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body on success, or the sentinel string
        '产生异常' when the request fails (connection problem, timeout,
        invalid URL, or an HTTP error status).
    """
    try:
        # Pass a 6-second timeout so a stalled server cannot hang the script.
        res = requests.get(url, timeout=6)
        # Turn HTTP error statuses (4xx/5xx) into an exception.
        res.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one declared in the headers, which may be missing or wrong.
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        # Only request-related failures are converted to the sentinel;
        # KeyboardInterrupt and programming errors now propagate instead
        # of being silently swallowed by a bare ``except``.
        return '产生异常'


def main():
    """Print the ``href`` value of every ``<a>`` tag on the target page."""
    # Target page; replace with any site you like.
    url = 'https://www.cnblogs.com/huwt/'
    demo = getHTMLText(url)
    # NOTE: on failure ``demo`` is the sentinel text, which parses to a
    # document without links, so nothing is printed in that case.
    soup = BeautifulSoup(demo, 'html.parser')
    # Select only the <a> tags that actually carry an href attribute.
    a_labels = soup.find_all('a', attrs={'href': True})
    for a in a_labels:
        print(a.get('href'))


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
测试结果:
https://www.cnblogs.com/huwt/ https://www.cnblogs.com/huwt/ https://www.cnblogs.com/ https://www.cnblogs.com/huwt/ https://i.cnblogs.com/EditPosts.aspx?opt=1 https://msg.cnblogs.com/send/%E8%B7%AF%E6%BC%AB%E6%BC%AB%E6%88%91%E4%B8%8D%E7%95%8F https://www.cnblogs.com/huwt/rss https://i.cnblogs.com/ https://www.cnblogs.com/huwt/archive/2019/04/10.html https://www.cnblogs.com/huwt/p/10680209.html https://www.cnblogs.com/huwt/p/10680209.html https://i.cnblogs.com/EditPosts.aspx?postid=10680209 https://www.cnblogs.com/huwt/p/10685968.html https://www.cnblogs.com/huwt/p/10685968.html https://i.cnblogs.com/EditPosts.aspx?postid=10685968 https://www.cnblogs.com/huwt/archive/2019/04/08.html https://www.cnblogs.com/huwt/p/10673470.html https://www.cnblogs.com/huwt/p/10673470.html https://i.cnblogs.com/EditPosts.aspx?postid=10673470 https://www.cnblogs.com/huwt/archive/2019/03/31.html https://www.cnblogs.com/huwt/p/10633896.html https://www.cnblogs.com/huwt/p/10633896.html https://i.cnblogs.com/EditPosts.aspx?postid=10633896 https://www.cnblogs.com/huwt/p/10632084.html https://www.cnblogs.com/huwt/p/10632084.html https://i.cnblogs.com/EditPosts.aspx?postid=10632084 https://www.cnblogs.com/huwt/archive/2019/03/30.html https://www.cnblogs.com/huwt/p/10629625.html https://www.cnblogs.com/huwt/p/10629625.html https://i.cnblogs.com/EditPosts.aspx?postid=10629625 https://www.cnblogs.com/huwt/archive/2019/03/25.html https://www.cnblogs.com/huwt/p/10597502.html https://www.cnblogs.com/huwt/p/10597502.html https://i.cnblogs.com/EditPosts.aspx?postid=10597502 https://www.cnblogs.com/huwt/archive/2019/03/24.html https://www.cnblogs.com/huwt/p/10591353.html https://www.cnblogs.com/huwt/p/10591353.html https://i.cnblogs.com/EditPosts.aspx?postid=10591353 https://www.cnblogs.com/huwt/archive/2019/03/16.html https://www.cnblogs.com/huwt/p/10540942.html https://www.cnblogs.com/huwt/p/10540942.html https://i.cnblogs.com/EditPosts.aspx?postid=10540942 
https://www.cnblogs.com/huwt/p/10541675.html https://www.cnblogs.com/huwt/p/10541675.html https://i.cnblogs.com/EditPosts.aspx?postid=10541675 https://www.cnblogs.com/huwt/default.html?page=2 [Finished in 1.1s]