Python 3.x: fetching page data on a schedule and storing it in a database
# Collect the data every five minutes and write it to the database
import pymysql
import urllib.request
from bs4 import BeautifulSoup
import threading


# Write one fund record to the database
def doDataWlpc(jjdm, jjmc, dwjz, dwjzrq):
    r_code = 0
    print('Fund info: ' + jjdm + ',' + jjmc + ',' + dwjz + ',' + dwjzrq)
    try:
        # Open the database connection
        conn = pymysql.connect(host='localhost', user='root', passwd='lizm',
                               db='pythondb', port=3306, charset='utf8')
        # Get a cursor
        cursor = conn.cursor()
        # Check whether this record already exists
        sql_check = ("SELECT * FROM pythondb.t_x01_wlpc WHERE dwjz='" + dwjz +
                     "' and dwjzrq='" + dwjzrq + "';")
        print('sql_check>>>:' + sql_check)
        cursor.execute(sql_check)
        results = cursor.fetchall()
        # 0 = no existing record, 1 = record already present
        if len(results) == 0:
            check_code = 0
        else:
            check_code = 1
        if check_code == 0:
            sql = ("INSERT INTO pythondb.t_x01_wlpc (jjdm,jjmc,dwjz,dwjzrq,oprdate) "
                   "VALUES('" + jjdm + "','" + jjmc + "','" + dwjz + "','" +
                   dwjzrq + "',sysdate());")
            try:
                print('sql>>>:' + sql)
                # Execute the SQL statement
                cursor.execute(sql)
                # Commit the transaction
                conn.commit()
                r_code = 0
            except Exception:
                # Roll back on any error
                conn.rollback()
                r_code = 1
        else:
            r_code = 0
            print('Fund ' + jjmc + ': data already exists')
        cursor.close()  # Close the cursor
        conn.close()    # Release the database connection
    except Exception as e:
        r_code = 1
        print('Failed with exception:', e)
    return r_code


# Scrape the fund information from the page
def getJjInfor(header_, url_):
    # Results are returned as a list
    r_info = []
    req = urllib.request.Request(url=url_, headers=header_)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    # Fund code: the ui-num span inside the title block
    jjdm = soup.find('div', class_='fundDetail-tit').find('span', class_='ui-num')
    r_info.append(jjdm.get_text())
    # Fund name: the title text before the parenthesised code
    title_name = soup.find('div', class_='fundDetail-tit')
    r_info.append(title_name.text.split('(')[0])
    # Estimated NAV, unit NAV and accumulated NAV
    for dataNums in soup.find_all('dd', class_='dataNums'):
        for jzs_ in dataNums.find_all('span', class_='ui-font-large ui-color-red ui-num'):
            r_info.append(jzs_.text)
    # Estimated-NAV timestamp
    gz_gztime = soup.find(id='gz_gztime')
    r_info.append(gz_gztime.text.replace('(', '').replace(')', ''))
    # Unit-NAV date: the first p element inside the 'dataItem02' block
    dwjzrq_s = soup.find('dl', class_='dataItem02').p
    r_info.append(dwjzrq_s.text.split('(')[1].split(')')[0])
    return r_info


def main():
    global timer
    url = r'http://fund.eastmoney.com/340007.html?spm=search'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    jj_infor = getJjInfor(headers, url)
    print(jj_infor)
    return_code = doDataWlpc(jj_infor[0], jj_infor[1], jj_infor[3], jj_infor[5])
    if return_code == 0:
        print('Run succeeded')
    else:
        print('Run failed')
    # Re-arm the timer so the collection repeats every five minutes
    timer = threading.Timer(5 * 60, main)
    timer.start()


if __name__ == '__main__':
    # Scheduled start: kick off the first run after one second
    timer = threading.Timer(1, main)
    timer.start()
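The script assumes the target table pythondb.t_x01_wlpc already exists; its definition is not shown here. The following one-off setup sketch creates a table with the columns the script actually writes (jjdm, jjmc, dwjz, dwjzrq, oprdate). The column types are assumptions inferred from how the values are used, not taken from this post.

# Setup sketch: create the table used by doDataWlpc above.
# Column types are assumptions; adjust them to your own needs.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS pythondb.t_x01_wlpc (
    jjdm    VARCHAR(10),    -- fund code
    jjmc    VARCHAR(100),   -- fund name
    dwjz    VARCHAR(20),    -- unit net asset value (stored as text by the script)
    dwjzrq  VARCHAR(20),    -- unit-NAV date
    oprdate DATETIME        -- insert timestamp, filled by sysdate()
) DEFAULT CHARSET = utf8;
"""

if __name__ == '__main__':
    conn = pymysql.connect(host='localhost', user='root', passwd='lizm',
                           db='pythondb', port=3306, charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute(DDL)
        conn.commit()
    finally:
        conn.close()

Because of IF NOT EXISTS it is safe to run this once before starting the collector and to re-run it later.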
Restricting execution to a specified time window will be added later; a rough sketch of one way to do it follows.
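One possible approach (not part of the original script) is to check the clock inside the timer callback, skip the scrape outside the window, and re-arm the timer either way so the schedule keeps running. The sketch below would replace main() and the __main__ block in the script above; the 09:30–15:00 window is an illustrative assumption.

# Sketch: only collect inside a configurable time window.
# getJjInfor and doDataWlpc are the functions defined in the script above.
import datetime
import threading

URL = r'http://fund.eastmoney.com/340007.html?spm=search'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
WINDOW_START = datetime.time(9, 30)   # assumed window start
WINDOW_END = datetime.time(15, 0)     # assumed window end

def timed_main():
    global timer
    now = datetime.datetime.now().time()
    if WINDOW_START <= now <= WINDOW_END:
        jj_infor = getJjInfor(HEADERS, URL)
        doDataWlpc(jj_infor[0], jj_infor[1], jj_infor[3], jj_infor[5])
    else:
        print('Outside the collection window, skipping this run')
    # Re-arm the timer either way so the next check still happens
    timer = threading.Timer(5 * 60, timed_main)
    timer.start()

if __name__ == '__main__':
    timer = threading.Timer(1, timed_main)
    timer.start()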