爬取股票数据
1 爬取前的分析
技术路线:使用 requests + bs4 + re
数据网站选取原则:股票信息静态存在于 HTML 页面中,非 js 代码生成;没有 Robots 协议限制
获取股票列表:
东方财富网:http://quote.eastmoney.com/stock_list.html
获取个股信息:
百度股票:https://gupiao.baidu.com/stock/
单个股票:https://gupiao.baidu.com/stock/sz002439.html
2 爬取流程
步骤1:从东方财富网获取股票列表
步骤2:根据股票列表逐个到百度股票获取个股信息
步骤3:将结果存储到文件
# -*- coding: utf-8 -*-
# @Time : 2019/8/30 15:39
# @Author : banshaohuan
# @Site :
# @File : gupiao_baidu_youhua.py
# @Software: PyCharm
import requests
from bs4 import BeautifulSoup
import re
def getHTMLText(url, code="utf-8"):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding name assigned to the response before its text is read.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only request failures are turned into "": a bare ``except`` would
        # also swallow KeyboardInterrupt and make the crawl impossible to stop.
        return ""
def get_stock_list(lst, stock_url):
    """Append every stock code found on the list page to *lst*.

    The page at *stock_url* is fetched as GB2312, and the ``href`` of each
    ``<a>`` tag is searched for a code made of ``sh`` or ``sz`` followed by
    six digits. The first code found in each link is appended; links with no
    ``href`` or no code are skipped.

    Args:
        lst: List that receives the stock codes (modified in place).
        stock_url: Address of the page listing all stocks.
    """
    html = getHTMLText(stock_url, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    # ``\d`` (with the backslash) matches a digit; without it the pattern
    # would look for six literal "d" characters and never match a code.
    code_pattern = re.compile(r"s[hz]\d{6}")
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if not isinstance(href, str):
            continue
        match = code_pattern.search(href)
        if match:
            lst.append(match.group(0))
def _parse_stock_info(html):
    """Build the info dict for one stock page, or return None if it is malformed.

    The dict holds the stock name under '股票名称' plus one entry per
    ``<dt>``/``<dd>`` pair found inside the ``stock-bets`` block.
    """
    soup = BeautifulSoup(html, 'html.parser')
    stock_info = soup.find('div', attrs={"class": "stock-bets"})
    if stock_info is None:
        return None
    names = stock_info.find_all(attrs={"class": "bets-name"})
    if not names:
        return None
    name_parts = names[0].text.split()
    if not name_parts:
        return None
    key_list = stock_info.find_all('dt')
    value_list = stock_info.find_all('dd')
    # A label without a matching value means the page layout is not the one
    # expected; drop the whole record rather than store a partial one.
    if len(value_list) < len(key_list):
        return None
    info_dict = {'股票名称': name_parts[0]}
    for key_tag, value_tag in zip(key_list, value_list):
        info_dict[key_tag.text] = value_tag.text
    return info_dict


def get_stock_info(lst, stock_url, file_path):
    """Fetch the detail page of every stock in *lst* and append it to a file.

    For each code the page ``stock_url + code + '.html'`` is downloaded and
    parsed; each successfully parsed record is written to *file_path* as one
    line containing the ``str()`` of its dict. Stocks whose page cannot be
    downloaded or parsed are skipped. A percentage is printed after every
    stock, successful or not, so the display reaches 100% at the end.

    Args:
        lst: Stock codes to look up.
        stock_url: Base address of the per-stock pages.
        file_path: File the records are appended to (UTF-8).
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        html = getHTMLText(stock_url + stock + '.html')
        info_dict = _parse_stock_info(html) if html else None
        if info_dict is not None:
            try:
                with open(file_path, 'a', encoding='utf-8') as f:
                    f.write(str(info_dict) + '\n')
            except OSError:
                # Best effort: a failed write skips this record and the
                # crawl carries on with the remaining stocks.
                pass
        # "\r" returns to the start of the line so the percentage is
        # overwritten in place instead of scrolling.
        print("\r当前进度:{:.2f}%".format(count * 100 / total), end="")
def main():
    """Collect the stock codes, then save every stock's details to a file."""
    list_page = 'https://quote.eastmoney.com/stock_list.html'
    detail_base = 'https://gupiao.baidu.com/stock/'
    target_path = 'D:/BaiduStockInfo.txt'

    # The list is filled in place by get_stock_list and then consumed by
    # get_stock_info.
    codes = []
    get_stock_list(codes, list_page)
    get_stock_info(codes, detail_base, target_path)
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
本文参考嵩天老师在中国大学 MOOC 的课程《Python 网络爬虫与信息提取》