需求:
https://intrinio.com/tutorial/web_api
我们通过上述网站提供的API获取了中国股市某只股票的csv数据文件,现在要下载多只股票的csv数据,并将其转换为xml文件,如何使用线程来提高下载和处理的效率?
思路:使用python的threading模块,为每只股票创建一个线程,并发地下载并转换数据
代码:
import csv
from StringIO import StringIO
from xml.etree.ElementTree import Element, ElementTree

import requests

from xml_pretty import pretty
def download(url):
    """Fetch *url* and return the response body wrapped in a StringIO.

    The request uses a 3 second timeout. Returns ``None`` when the server
    answers with a non-OK status; exceptions raised by ``requests`` (for
    example on timeout) are not caught here and propagate to the caller.
    """
    resp = requests.get(url, timeout=3)
    if not resp.ok:
        return None
    return StringIO(resp.content)
def csvToxml(scsv, fxml):
    """Convert CSV data into an XML document and write it to *fxml*.

    The first CSV row supplies the column names. Every following row becomes
    a ``<Row>`` element under a ``<Data>`` root, with one child element per
    column whose tag is the column name and whose text is the cell value.

    Args:
        scsv: Iterable of CSV lines (e.g. the StringIO returned by
            ``download``).
        fxml: File name or writable file object handed to
            ``ElementTree.write``.

    Raises:
        StopIteration: If *scsv* contains no header row.
    """
    reader = csv.reader(scsv)
    # next() works on both Python 2 and Python 3, unlike reader.next().
    header = next(reader)
    # Column names are used as element tags, which cannot contain spaces.
    # NOTE(review): the original call read `h.replace( , )` with its
    # arguments lost; removing spaces is the reconstruction - confirm.
    # A list (not a lazy map object) is required because the headers are
    # zipped against every row, not just the first one.
    headers = [h.replace(' ', '') for h in header]
    root = Element("Data")
    for row in reader:
        eRow = Element("Row")
        root.append(eRow)
        for tag, text in zip(headers, row):
            e = Element(tag)
            e.text = text
            eRow.append(e)
    # pretty() comes from the project-local xml_pretty module; presumably it
    # adds indentation whitespace to the tree in place - verify there.
    pretty(root)
    et = ElementTree(root)
    et.write(fxml)
def handle(sid):
    """Download the CSV for stock *sid* and save it as ``<code>.xml``.

    *sid* is an integer stock id; it is left-padded with zeros to six
    characters to form both the query symbol and the output file name.
    Nothing is written when the download returns ``None``.
    """
    print('Download...(%d)' % sid)
    code = str(sid).rjust(6, '0')
    rf = download('http://table.finance.yahoo.com/table.csv?s=%s.sz' % code)
    if rf is None:
        return

    print('convert to xml...(%d)' % sid)
    with open(code + '.xml', 'wb') as wf:
        csvToxml(rf, wf)
# Approach 1: hand the worker function straight to Thread.
from threading import Thread

# Create a thread object that processes the first stock (sid=1).
t = Thread(target=handle, args=(1,))
# start() has to be *called*: a bare `t.start` only looks the method up and
# the thread never runs.
t.start()
# Approach 2: subclass Thread and override run().
class MyThread(Thread):
    """Thread that processes one stock id via ``handle``."""

    def __init__(self, sid):
        # Run the parent constructor before storing our own state.
        super(MyThread, self).__init__()
        self.sid = sid

    def run(self):
        """Thread body: download and convert the stock given at creation."""
        handle(self.sid)
# Start one worker thread for each stock id from 1 to 10.
threads = []
for sid in xrange(1, 11):
    worker = MyThread(sid)
    threads.append(worker)
    worker.start()

# join() blocks until the worker thread has exited, so the main thread does
# not reach the print below until every run() has finished.
for worker in threads:
    worker.join()

print('main thread')
# io型操作(大部分时间在等待外部资源),相当于超市订货,例如上面的download操作
# cpu型操作(大部分时间在占用处理器计算),相当于超市货物搬运,例如csv转换xml文件
# python中的线程不适合cpu密集型操作,原因是global interpreter lock(全局解释器锁,GIL):在CPython中同一时刻只有一个线程能执行python字节码,所以python中的线程只适合处理io型的操作
if __name__ == '__main__':
    # Single-threaded run for one stock: download the CSV, then convert it.
    url = 'http://table.finance.yahoo.com/table.csv?s=000001.sz'
    rf = download(url)
    # download() returns None on a non-OK response; skip the conversion then.
    if rf is not None:
        # The mode must be the string 'wb'; the bare name `wb` is undefined
        # and raised NameError before any file was opened.
        with open('000001.xml', 'wb') as wf:
            csvToxml(rf, wf)
=================================================================
import requests
import base64
from io import StringIO
import csv
from xml.etree.ElementTree import ElementTree, Element, SubElement
# NOTE(review): this credential is committed in source; consider loading it
# from the environment or a config file instead.
apikey = 'OjZlY2MzYTQwNGVlMTI3Y2VkYjMyYTZiNzJiYzdlOTFk'


def download_csv(page_number):
    """Download one page of AAPL price data as CSV.

    Args:
        page_number: Page of the result set to request (20 rows per page,
            date range 2017-09-28 to 2020-09-28).

    Returns:
        A ``StringIO`` holding the CSV text, or ``None`` when the server
        answers with a non-OK status.

    Raises:
        requests.RequestException: On connection errors or when the request
            exceeds the timeout. These are deliberately not swallowed so a
            dead network ends the caller instead of being retried forever.
    """
    print('download csv data [page=%s]' % page_number)
    # The key is sent as a query parameter and taken from the module-level
    # constant rather than being repeated inline in the URL.
    params = {
        'api_key': apikey,
        'identifier': 'AAPL',
        'page_size': 20,
        'page_number': page_number,
        'start_date': '2017-09-28',
        'end_date': '2020-09-28',
    }
    # Without a timeout a stalled connection would block this thread forever.
    response = requests.get(
        'https://api.intrinio.com/prices.csv', params=params, timeout=10)
    if not response.ok:
        return None
    return StringIO(response.text)
def _indent_tree(elem, level=0, unit='  '):
    """Set text/tail whitespace so the tree serialises one element per line."""
    pad = '\n' + unit * level
    children = list(elem)
    if children:
        # Whitespace before the first child, one level deeper than elem.
        elem.text = pad + unit
        for child in children:
            _indent_tree(child, level + 1, unit)
            child.tail = pad + unit
        # The last child is followed by elem's closing tag at elem's level.
        children[-1].tail = pad
    if level == 0:
        elem.tail = '\n'


def csv_to_xml(csv_file, xml_path):
    """Convert CSV data to an indented XML file.

    The first CSV row supplies the column names. Each following non-empty
    row becomes a ``<Row>`` element under a ``<Data>`` root, with one child
    per column (tag = column name, text = cell value).

    Args:
        csv_file: Iterable of CSV lines, e.g. the ``StringIO`` returned by
            ``download_csv``.
        xml_path: Destination passed to ``ElementTree.write``.

    Empty input (no header row) produces a document containing only an empty
    ``<Data>`` element instead of raising ``StopIteration``.
    """
    print('Convert csv data to %s' % xml_path)
    reader = csv.reader(csv_file)
    headers = next(reader, None)
    root = Element('Data')
    if headers is not None:
        for row in reader:
            # csv.reader yields [] for blank lines; they carry no data.
            if not row:
                continue
            row_elem = SubElement(root, 'Row')
            for tag, text in zip(headers, row):
                SubElement(row_elem, tag).text = text
    # Indentation is applied in one pass over the finished tree rather than
    # by hand-setting text/tail while building it.
    _indent_tree(root)
    # 'utf-8' is the registered encoding name; the declaration is requested
    # explicitly because ElementTree omits it by default for UTF-8.
    ElementTree(root).write(xml_path, encoding='utf-8', xml_declaration=True)
def download_and_save(page_number, xml_path):
    """Download one page of CSV data and write it to *xml_path* as XML.

    Args:
        page_number: Page number passed to ``download_csv``.
        xml_path: Output file for the converted XML. This argument is now
            honoured; the file name used to be rebuilt from *page_number*
            and the parameter was ignored.

    The download is retried for as long as ``download_csv`` returns ``None``;
    there is no retry limit.
    """
    # IO-bound step: fetch the CSV, retrying while the download reports failure.
    csv_file = download_csv(page_number)
    while csv_file is None:
        csv_file = download_csv(page_number)
    # CPU-bound step: convert the CSV to XML.
    csv_to_xml(csv_file, xml_path)
from threading import Thread
class MyThread(Thread):
    """Thread that downloads one page of data and saves it as XML."""

    def __init__(self, page_number, xml_path):
        Thread.__init__(self)
        self.page_number, self.xml_path = page_number, xml_path

    def run(self):
        """Thread body: hand the stored arguments to ``download_and_save``."""
        download_and_save(self.page_number, self.xml_path)
if __name__ == '__main__':
    import time

    started = time.time()

    # Start one worker thread per page (pages 1 to 5).
    thread_list = []
    for page in range(1, 6):
        worker = MyThread(page, 'data%s.xml' % page)
        worker.start()
        thread_list.append(worker)

    # Wait for every worker before reporting the elapsed time.
    for worker in thread_list:
        worker.join()

    # Sequential variant kept for timing comparison:
    # for i in range(1, 6):
    #     download_and_save(i, 'data%s.xml' % i)

    print(time.time() - started)
    print('main thread end.')