最近在看 MongoDB,学会了一些最简单的 MongoDB 操作,于是想结合股票信息数据的抓取,把抓到的数据存储到 MongoDB 中。MongoDB 和关系型数据库最大的区别是:MongoDB 不需要事先建表,可以直接存储。需要注意的是,选定数据表(集合)之后,在插入数据时要先把 str 格式的字符串转换成 JSON 格式(在 Python 中即 dict)再插入。这一点我在插入数据时调试了十多分钟,一直以为是自己字符串的问题,后来对照了插入数据的格式并百度了一下,才发现这一点。数据是插入到本机的 test.Share 表中的,其他需要注意的地方就没有什么了~代码写得很丑,冗余也很大,之后还会继续更新~另外,程序是单进程进行数据抓取的~嗯~很蠢~
# -*- coding: utf-8 -*-
"""Fetch quotes from quote.fx678.com, store them in MongoDB and plot them.

Flow (single process, sequential):

1. ``get_type_url`` downloads the listing page at ``URL`` and queues the
   symbol names and the table-cell values found on it.
2. ``get_precent`` downloads the history JSON for every queued symbol.
3. ``draw_pict`` inserts one document per symbol into the local
   ``test.Share`` collection and keeps the on-the-hour samples.
4. ``draw_picture`` plots the kept samples, one subplot per symbol.

Only single-argument ``print(...)`` calls are used so the syntax is valid on
both Python 2 and Python 3.
"""
import json
import re  # kept from the original script; unused below
import time
import urllib  # kept from the original script; unused below

try:
    import urllib2  # Python 2; kept from the original script, unused below
except ImportError:  # Python 3 has no urllib2 module
    import urllib.request as urllib2

try:
    from Queue import Queue  # Python 2
except ImportError:  # Python 3
    from queue import Queue

import matplotlib.pyplot as plt
import requests
from lxml import etree
from pymongo import MongoClient

URL = 'http://quote.fx678.com/exchange/WH'
HISTORY_URL = ('http://api.q.fx678.com/history.php?symbol=%s'
               '&limit=288&resolution=5&codeType=8100&st=0.8274405615006541')
# Seconds to wait for a response before a request counts as failed.
REQUEST_TIMEOUT = 10

nation_que = Queue()
client = MongoClient('localhost', 27017)
db = client.test
Share = db.Share


def sub_sort(array, array1, low, high):
    """Partition ``array[low..high]`` around ``array[low]``, in place.

    Every move made in ``array`` is mirrored in ``array1`` so the two lists
    stay aligned index by index.

    Returns:
        The final index of the pivot. Elements left of it are smaller than
        the pivot; elements right of it are greater than or equal to it.
    """
    key = array[low]
    key1 = array1[low]
    # Invariant: the "hole" (the slot whose value is saved elsewhere) is at
    # ``low``; ``array[high]`` is the next element still to be classified.
    while low < high:
        while low < high and array[high] >= key:
            high -= 1
        while low < high and array[high] < key:
            array[low] = array[high]
            array1[low] = array1[high]
            low += 1
            # Move the next unclassified element to ``high`` so the hole is
            # at ``low`` again.
            array[high] = array[low]
            array1[high] = array1[low]
    array[low] = key
    array1[low] = key1
    return low


def quick_sort(array, array1, low, high):
    """Sort ``array[low..high]`` ascending in place, reordering ``array1`` alike.

    The sort is not stable: entries of ``array1`` that belong to equal keys
    may end up in any relative order.
    """
    if low < high:
        key_index = sub_sort(array, array1, low, high)
        # The pivot is already in its final slot, so it is excluded from both
        # recursive calls.
        quick_sort(array, array1, low, key_index - 1)
        quick_sort(array, array1, key_index + 1, high)


def download(url, headers, num_try=2):
    """Fetch ``url`` with ``requests`` and return the response body as text.

    Args:
        url: Address to fetch.
        headers: HTTP request headers to send.
        num_try: Maximum number of attempts.

    Returns:
        The response text, or ``None`` when every attempt failed (network
        error, timeout or HTTP error status).
    """
    for _ in range(max(num_try, 0)):
        try:
            response = requests.get(url, headers=headers,
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # requests raises its own exception hierarchy, never
            # urllib2.URLError, so this is the type that must be caught for
            # the retry to happen.
            print('Download error: %s' % exc)
        else:
            return response.text
    return None


current_quto = Queue()
open_quto = Queue()
high_quto = Queue()
low_quto = Queue()
close_quto = Queue()
update_time = Queue()


def get_type_url():
    """Download the listing page, queue its symbols and quotes, then process them.

    The ``<td>`` text nodes of the page are dealt round-robin into the six
    quote queues (current, open, high, low, close, update time) and the last
    path segment of every ``a.mar_name`` link is queued as a symbol name.
    Finally ``get_precent`` is called to process the queued symbols.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Referer': 'http://quote.fx678.com/exchange/WH',
        'Cookie': 'io=-voMclEjiizK9nWKALqB; UM_distinctid=15f5938ddc72db-089cf9ba58d9e5-31657c00-fa000-15f5938ddc8b24; Hm_lvt_d25bd1db5bca2537d34deae7edca67d3=1509030420; Hm_lpvt_d25bd1db5bca2537d34deae7edca67d3=1509031023',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept': '*/*',
    }
    content = download(URL, headers)
    if not content:
        # Nothing to parse; etree.HTML cannot be given None.
        print('No content downloaded from %s' % URL)
        return

    html = etree.HTML(content)
    symbol_links = html.xpath('//a[@class="mar_name"]/@href')
    cells = html.xpath('//td/text()')

    # NOTE(review): assumes every table row contributes exactly six text
    # cells in this order - confirm against the live page.
    columns = (current_quto, open_quto, high_quto,
               low_quto, close_quto, update_time)
    for index, cell in enumerate(cells):
        columns[index % len(columns)].put(cell)

    for link in symbol_links:
        nation_que.put(link.split('/')[-1])

    get_precent()


def _next_or_none(queue):
    """Pop the next item of ``queue`` without blocking; ``None`` if it is empty."""
    if queue.empty():
        return None
    value = queue.get(False)
    queue.task_done()
    return value


def get_precent():
    """Fetch the history of every queued symbol and hand it to ``draw_pict``.

    For each symbol one value is taken from each quote queue. A queue that
    has run out yields ``None`` instead of raising ``NameError`` or silently
    reusing the previous symbol's value. Symbols whose history cannot be
    downloaded or parsed are skipped.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'api.q.fx678.com',
        'Origin': 'http://quote.fx678.com',
        'Referer': 'http://quote.fx678.com/symbol/USD',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    }
    while not nation_que.empty():
        time_update = _next_or_none(update_time)
        new_rates = _next_or_none(current_quto)
        opening = _next_or_none(open_quto)
        high = _next_or_none(high_quto)
        low = _next_or_none(low_quto)
        closing = _next_or_none(close_quto)

        ss = nation_que.get(False)
        nation_que.task_done()

        # Progress output, one value per line as in the original script.
        print(ss)
        print(low)
        print(high)
        print(time_update)
        print(new_rates)
        print(opening)

        url = HISTORY_URL % ss
        print(url)

        # download() already implements the retry loop.
        content = download(url, headers)
        if content is None:
            continue
        try:
            history = json.loads(content)
            st = history['h']
            T_time = history['t']
        except (ValueError, KeyError, TypeError) as exc:
            print('Bad history payload for %s: %s' % (ss, exc))
            continue

        if len(st) > 0 and len(T_time) > 0:
            draw_pict(ss, T_time, st, time_update, new_rates,
                      opening, high, low, closing)


List = []


def _as_text(value):
    """Render ``value`` with ``%s`` formatting, keeping ``None`` as ``None``."""
    return None if value is None else '%s' % value


def draw_pict(name, T_time1, high_rate, time_update, new_rate, opening, high,
              low, closing):
    """Store one symbol's quote in MongoDB and keep its on-the-hour samples.

    Args:
        name: Symbol name, stored under ``Type``.
        T_time1: Sample timestamps; each must be accepted by ``float()`` and
            is interpreted as seconds since the epoch by ``time.localtime``.
        high_rate: Sample values, paired with ``T_time1`` by position.
        time_update, new_rate, opening, high, low, closing: Quote fields
            stored alongside the samples.

    Side effects:
        Inserts one document into ``Share`` and appends the list of
        on-the-hour sample values (ordered by local hour) to ``List``.
    """
    real_time = {}
    hours = []
    hourly_rates = []
    for stamp, rate in zip(T_time1, high_rate):
        # Keys and values are stored as text, matching the documents the
        # original string-built JSON produced.
        real_time['%s' % stamp] = '%s' % rate
        local = time.localtime(float(stamp))
        if local.tm_min == 0:
            hours.append(local.tm_hour)
            hourly_rates.append(rate)

    # Build the document as a dict instead of formatting and re-parsing a
    # JSON string: a quote or backslash in any value would break the parse.
    document = {
        'Type': _as_text(name),
        'Current_quto': _as_text(new_rate),
        'Opening_quto': _as_text(opening),
        'High_quto': _as_text(high),
        'low_quto': _as_text(low),
        'Closing_quto': _as_text(closing),
        'Update_Time': _as_text(time_update),
        'Real_TIme_infor': real_time,
    }
    print(document)
    # Collection.insert is deprecated; insert_one is its single-document
    # replacement.
    Share.insert_one(document)

    quick_sort(hours, hourly_rates, 0, len(hourly_rates) - 1)
    List.append(hourly_rates)


def draw_picture():
    """Plot every series collected in ``List`` on a two-column subplot grid."""
    if not List:
        print('No data to plot')
        return

    rows = (len(List) + 1) // 2
    for position, series in enumerate(List, 1):
        plt.subplot(rows, 2, position)
        print(len(series))
        plt.plot(list(range(len(series))), series, marker='*')

    # The title has to be set before show(); once show() returns the figure
    # has already been displayed.
    plt.suptitle('Share Message')
    plt.show()


if __name__ == '__main__':
    get_type_url()
    draw_picture()