import requests
from bs4 import BeautifulSoup

# Browser User-Agent header so the site serves the normal desktop page
# instead of rejecting the request as a bot.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}


def scrape_jandan():
    """Fetch the jandan.net front page and print each post's title,
    summary and image URL.

    For every ``div.post.f.list-post`` inside ``div#content`` the function
    prints the ``h2`` title text, the ``div.indexs`` summary text, the
    ``img`` tag itself and its lazy-load ``data-original`` attribute.
    Posts missing any of those three parts are skipped.

    Returns nothing; output goes to stdout. Network or HTTP errors from
    ``requests`` propagate to the caller.
    """
    # timeout prevents the script from hanging forever on a dead connection
    r1 = requests.get(url='http://jandan.net/', headers=HEADERS, timeout=10)

    # Parse the response body with the stdlib-backed html.parser.
    soup = BeautifulSoup(r1.text, 'html.parser')

    # First <div> with id="content" holds the post list.
    container = soup.find(name='div', attrs={'id': 'content'})
    if container is None:
        # Page layout changed (or we got an error page) — nothing to print.
        return

    # All post entries: <div class="post f list-post">.
    div_list = container.find_all(name='div', attrs={'class': 'post f list-post'})

    for tag in div_list:
        # Title lives in the <h2>; skip posts without one.
        articles = tag.find(name='h2')
        if not articles:
            continue

        # Summary lives in <div class="indexs">; skip if absent.
        summay = tag.find(name='div', attrs={'class': 'indexs'})
        if not summay:
            continue

        # Thumbnail <img>; the real URL is in the lazy-load attribute.
        img_addr = tag.find(name='img')
        if not img_addr:
            continue

        print('标题', articles.text)
        print('简介------------------------------------', summay.text)
        print(img_addr)
        print(img_addr.get('data-original'))
        print('----------------------------------------')


if __name__ == '__main__':
    scrape_jandan()