# Simple scraper for Autohome (汽车之家) news, using the requests module + bs4
import os
import sys
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Listing page that is scraped for news entries.
PAGE_URL = 'https://www.autohome.com.cn/all/#pvareaid=3311230'
# Directory (relative to the working directory) where images are saved.
IMG_DIR = 'bs4_img'
# Seconds to wait on each HTTP request; requests waits indefinitely by default.
REQUEST_TIMEOUT = 10

# Output format for one news entry: title, summary, image src, link.
ITEM_TEMPLATE = '''
标题:%s
摘要:%s
图片:%s
链接:%s
'''


def _parse_news_item(li):
    """Extract the fields of one news entry from an ``<li>`` tag.

    Args:
        li: a BeautifulSoup tag for one ``<li>`` element of the page.

    Returns:
        A ``(title, desc, img, url)`` tuple of strings, or ``None`` when the
        ``<li>`` has no ``<h3>`` (i.e. it is not a news entry). ``desc``,
        ``img`` and ``url`` are empty strings when the corresponding
        ``<p>``, ``<img src>`` or ``<a href>`` is missing.
    """
    h3 = li.find(name='h3')
    if h3 is None:
        return None

    p = li.find(name='p')
    img = li.find(name='img')
    a = li.find(name='a')

    # find() returns None for a missing tag, and a tag may lack the
    # attribute, so fall back to '' instead of raising.
    desc = p.text if p is not None else ''
    src = (img.get('src') or '') if img is not None else ''
    href = (a.get('href') or '') if a is not None else ''
    return h3.text, desc, src, href


def _download_image(img_src, page_url=PAGE_URL, img_dir=IMG_DIR):
    """Download one image and store it under ``img_dir``.

    Args:
        img_src: the ``src`` attribute value as found in the page; may be
            absolute, protocol-relative (``//host/path``) or relative.
        page_url: URL of the page the ``src`` came from, used to resolve it.
        img_dir: directory to save into; created if it does not exist.

    Returns:
        The path of the saved file, or ``None`` if no file name could be
        derived from the URL.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
        OSError: if the directory or file cannot be written.
    """
    # urljoin handles every form of src; for '//host/path' it yields the
    # same result as prefixing 'https:'.
    img_url = urljoin(page_url, img_src)
    # Take the name from the URL path only, so a query string never ends
    # up in the file name.
    img_name = os.path.basename(urlsplit(img_url).path)
    if not img_name:
        return None

    response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
    # Do not save an HTML error page as if it were an image.
    response.raise_for_status()

    os.makedirs(img_dir, exist_ok=True)
    path = os.path.join(img_dir, img_name)
    with open(path, 'wb') as f:
        # The body is already fully downloaded (no stream=True), so write
        # it in one call rather than iterating byte by byte.
        f.write(response.content)
    return path


def main():
    """Fetch the listing page, save each entry's image, print each entry.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched. Failures for individual images are reported on stderr
            and do not stop the run.
    """
    ret = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    ret.raise_for_status()
    # Force the decoding used for ret.text instead of relying on detection.
    ret.encoding = 'gb2312'

    soup = BeautifulSoup(ret.text, 'lxml')
    for li in soup.find_all(name='li'):
        item = _parse_news_item(li)
        if item is None:
            continue
        title, desc, img, url = item

        if img:
            try:
                _download_image(img)
            except requests.RequestException as exc:
                # One broken image should not abort the whole scrape.
                print('failed to download %s: %s' % (img, exc),
                      file=sys.stderr)

        print(ITEM_TEMPLATE % (title, desc, img, url))


if __name__ == '__main__':
    main()