# import os  # unused in this script, left commented out
import requests
from bs4 import BeautifulSoup

# Fetch the search page while mimicking a real browser.
r1 = requests.get(
    # The cnblogs search-result page to scrape.
    url='https://zzk.cnblogs.com/s/blogpost?Keywords=blog%3aaronthon%201&pageindex=9',
    # Browser identification sent with the request.
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
)

# Response encoding (uncomment if the page is GBK-encoded).
# r1.encoding = 'gbk'

# Capture the cookies from this first response, kept for any follow-up requests.
r1_cookie_dict = r1.cookies.get_dict()

# Parse the data we want out of the response body.
soup = BeautifulSoup(r1.text, 'html.parser')
container = soup.find(name='div', attrs={'class': 'forflow'})
div_list = container.find_all(name='div', attrs={'class': 'searchItem'})

# Loop over every search-result item.
for tag in div_list:
    articles = tag.find(name='a', attrs={'target': '_blank'})
    if not articles:
        continue
    summary = tag.find(name='span', attrs={'class': 'searchCon'})
    if not summary:
        continue
    print(articles.text)
    print('Article URL:', articles['href'])
    print('------------------------------------------------')

    # Append the scraped blog title and URL to a local file.
    articles_title = articles.text
    try:
        with open('article.txt', 'a+', encoding='utf-8') as f:
            f.write(articles_title)
            f.write(' ')
            f.write(articles['href'])
            f.write('\n')  # one article per line
    except OSError:
        print('Failed to write the article to article.txt')
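
The cookie captured in r1_cookie_dict above is never used in this excerpt, and the URL hard-codes pageindex=9. Below is a minimal sketch of how the same request could be parameterized to walk several result pages while reusing the cookie from the first response. BASE_URL, fetch_page, and the page range are illustrative names and values for this sketch, not part of the original script.

import requests
from bs4 import BeautifulSoup

# Assumption: the search endpoint and query parameters mirror the URL used above.
BASE_URL = 'https://zzk.cnblogs.com/s/blogpost'
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

def fetch_page(page_index, cookies=None):
    """Fetch one page of search results, optionally reusing an earlier cookie."""
    resp = requests.get(
        BASE_URL,
        params={'Keywords': 'blog:aronthon 1', 'pageindex': page_index},
        headers=HEADERS,
        cookies=cookies,
    )
    return BeautifulSoup(resp.text, 'html.parser')

# Capture the cookie from the first request, then reuse it for later pages.
first = requests.get(
    BASE_URL,
    params={'Keywords': 'blog:aronthon 1', 'pageindex': 1},
    headers=HEADERS,
)
cookie_dict = first.cookies.get_dict()

for page in range(1, 4):  # pages 1-3 as an example range
    soup = fetch_page(page, cookies=cookie_dict)
    container = soup.find('div', attrs={'class': 'forflow'})
    if container is None:
        break  # no result container on this page; stop paging
    for item in container.find_all('div', attrs={'class': 'searchItem'}):
        link = item.find('a', attrs={'target': '_blank'})
        if link:
            print(link.text, link['href'])

Factoring the request into fetch_page keeps the headers and query parameters in one place, so only the page index changes between calls.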