This post extracts a small fund table from Stockstar (证券之星) as a worked example (for how to write XPath expressions, look it up yourself, e.g. in the Runoob / 菜鸟教程 tutorials).
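Before the full script, here is a minimal self-contained sketch of the core idea: pull every cell text out of the table with XPath, then regroup the flat list into rows of 8 fields. It uses a made-up two-row table rather than the live Stockstar page, and td/text() so the toy markup has no whitespace-only text nodes:

from lxml import etree

# a hypothetical two-row fund table; the values are made up, not real fund data
sample = ('<table><tbody id="datalist">'
          '<tr><td>000001</td><td>FundA</td><td>1.23</td><td>2.34</td>'
          '<td>0.01</td><td>0.8%</td><td>open</td><td>open</td></tr>'
          '<tr><td>000002</td><td>FundB</td><td>1.11</td><td>1.99</td>'
          '<td>-0.02</td><td>-1.7%</td><td>open</td><td>closed</td></tr>'
          '</tbody></table>')

tree = etree.HTML(sample)
cells = tree.xpath('//*[@id="datalist"]/tr/td/text()')    # flat list of 16 cell strings
rows = [cells[i:i + 8] for i in range(0, len(cells), 8)]  # regroup into rows of 8
print(rows[0])  # ['000001', 'FundA', '1.23', '2.34', '0.01', '0.8%', 'open', 'open']

The full script below applies the same pattern to the live page.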
import requests
from lxml import etree
import csv
def gethtml(url, headers):    # fetch the page and return its text
    try:
        req = requests.get(url, headers=headers)    # pass headers so the User-Agent is actually sent
        req.raise_for_status()
        req.encoding = 'gb18030'
        return req.text
    except requests.RequestException as e:
        print('Error:', e)
def getcontent(html):    # extract every cell text under the target table (the header row is not included)
    tree = etree.HTML(html)
    name = tree.xpath('//*[@id="datalist"]/tr//text()')
    td = list(name)
    td1 = [td[i:i + 8] for i in range(0, len(td), 8)]    # split into rows of 8 fields so each CSV line is one fund
    return td1
def save_data(fname, td1):    # save the rows as a CSV file
    with open(fname, 'w', encoding='gb18030', newline='') as f:
        writer = csv.writer(f)
        # column headers: fund code, fund name, unit NAV, accumulated NAV, daily change, daily change %, subscription, redemption
        writer.writerow(('基金代码', '基金名称', '单位净值', '累计净值', '日增长额', '日增长率', '申购', '赎回'))
        for row in td1:
            writer.writerow(row)
def main():
    url = 'http://quote.stockstar.com/fund/mixed.shtml'
    fname = 'E:/shuju/t.csv'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    save_data(fname, getcontent(gethtml(url, headers=headers)))

if __name__ == '__main__':
    main()
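After running main(), a quick sanity check is to read the file straight back with the csv module (this assumes the path E:/shuju/t.csv hard-coded in main and the same gb18030 encoding; adjust both for your machine):

import csv

with open('E:/shuju/t.csv', encoding='gb18030', newline='') as f:
    for row in csv.reader(f):
        print(row)    # the header row first, then one 8-field list per fund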