• 江河数据(一)


    1 说明

    这些数据其实早在2015年底就已经收集,并处理。

    1.1 数据说明

    数据来源于全国水雨晴信息网站,html表格的形式作为原始的存储。

    enter description here

    1.2 处理与展示

    Mysql数据存储,每月一个Table;通过这个一个table数据又处理为Excel的表格。

    enter description here

    enter description here

    2 写Mysql数据库

    2.1 读html写入数据库

    以一个月为单位,将这一个月的html数据转存到Mysql的一张表中。

    # -*- encoding: utf8 -*-
    '''
    Created on 2015年12月8日
    @author: baoluo
    '''
    from bs4 import BeautifulSoup
    import time
    import MySQLdb
    import sys
    import re
    import os
    import glob
    
    reload(sys)
    sys.setdefaultencoding( 'utf-8' )
    
    def checkDir():
        if not os.path.isdir('./htmls'):
            os.mkdir("./htmls")
    
    
    def tn():
        t = time.strftime("%Y%m%d", time.localtime())
        #print t,type(t)
        return t
    
    def _w_log(fn):
        #print fn +  '  write log.txt'
        fp = open('log.txt','a+')
        fp.write(time.strftime("%Y-%m-%d %H:%M:%S  ", time.localtime()) + fn + '
    ')
        fp.close()
    
    def table(fn, tp):
        fp = open(fn,'r')
        html =fp.read()
        fp.close()
        strhtml = html.replace(' ','')
    
        date =''
        re1 = r'd{4}-d{2}-d{2}'
        rg=re.compile(re1)
        m = rg.findall(strhtml[:7777])
        if m:
            date = m[0]
        #print date
        dbname = date.split('-')[0]+date.split('-')[1]
        sql = ''
        if tp=='hd':
            dbname = 'big_rivers'+dbname
        else:
            dbname = 'reservoir'+ dbname
    
        #数据库操作
        conn= MySQLdb.connect(
                host='localhost',
                port = 3306,
                user='root',
                passwd='root',
                charset='utf8'
                )
        cur = conn.cursor()
        conn.select_db('information_schema')
        #print 'dbname :->'+dbname
        fl = cur.execute("SELECT * FROM tables WHERE table_name='"+dbname+"';")
        conn.select_db('Grab_Hydrology_Data')
        if 0 == fl :
            print 'Creat new database:' + dbname
            if 'big_rivers' in dbname:
                creatsql = "CREATE TABLE " + dbname + """
                    (id  VARCHAR(20)  primary key,
                    流域  VARCHAR(20),
                    行政区 VARCHAR(30),
                    河名 VARCHAR(30),
                    站名 VARCHAR(30),
                    时间 VARCHAR(30),
                    水位 VARCHAR(10),
                    流量 VARCHAR(10),
                    警戒水位 VARCHAR(10))"""
            else:
                creatsql = "CREATE TABLE " + dbname + """
                    (id  VARCHAR(20)  primary key,
                    流域  VARCHAR(20),
                    行政区 VARCHAR(30),
                    河名 VARCHAR(30),
                    库名 VARCHAR(30),
                    库水位 VARCHAR(10),
                    蓄水量 VARCHAR(10),
                    入库 VARCHAR(10),
                    提顶高程 VARCHAR(10))"""
            #print creatsql
            cur.execute(creatsql)
    
        soup = BeautifulSoup(strhtml,"html.parser")
        trs= soup.findAll('tr')
        i=0
        #print 'trs',trs
        for tr in trs:
            i += 1
            param=[]
            sp = BeautifulSoup(str(tr),"html.parser")
            tds= sp.findAll('td')
            #print i,'tds',tds
            for td in tds:
                td = str(td)
                #print i,'td ',td
                if 'style' not in td:
                    begin=td.index('">')+len('">')
                    end=td.index('</')
                    param.append(td[begin:end])
                else:
                    #print td
                    try:
                        td = td.split(')">')[1]
                        param.append(td.split('<')[0])
                    except IndexError:
                        td = td.split(');">')[1]
                        param.append(td.split('<')[0])
    
            idname = "'"+date + ',' + str(i)+"',"
            vs = ''
            #print i,param
    
            for v in param:
                v = "'" + v.strip() + "'"
                vs += v + ','
            vs = idname + vs[:-1] +')'
            '''
            print param[0].decode('utf8').encode('GBK'),
            param[1].decode('utf8').encode('GBK'),
            param[2].decode('utf8').encode('GBK'),
            param[3].decode('utf8').encode('GBK'),
            param[4],param[5],param[6],param[7]
            '''
            try:
                cur.execute('INSERT INTO '+ dbname + ' VALUES (' + vs)
                conn.commit() # 提交到数据库执行
            except MySQLdb.Error,e:
                print "Mysql Error %d: %s" % (e.args[0], e.args[1])
                if 1062==e.args[0]:
                    break
                elif 1064==e.args[0]:
                    print vs.decode('utf8').encode('GBK')
                    break
                else:
                    break
                #print 'Already exists:' + vs.decode('utf8').encode('GBK')
                #_w_log(fn + "  except")
                conn.rollback() # 发生错误时回滚
    
        cur.close()
        conn.close()
    
    
    if __name__ == '__main__':
        header = {'User-Agent':'Mozilla/5.0 (Windows NT 5.1; rv:28.0) Gecko/20100101 Firefox/28.0'}
        #checkDir()
        files = glob.glob('./htmdata/htmls-counter/2016/201610*.htm')
        print len(files),"个文件",time.strftime("%Y-%m-%d  %H:%M:%S ", time.localtime())
        for f in files:
            if  ( '江河'  in f.decode('gbk') or 'big_rivers' in f):
                print f.decode('gbk'),time.strftime("%Y-%m-%d  %H:%M:%S ", time.localtime())
                table(f,'hd')
                print time.strftime("%Y-%m-%d  %H:%M:%S ", time.localtime())
    
            elif '水库' or 'reservoir' in f.decode('gbk'):
                print f.decode('gbk'),time.strftime("%Y-%m-%d  %H:%M:%S ", time.localtime())
                table(f,'sk')
                print time.strftime("%Y-%m-%d  %H:%M:%S ", time.localtime())
    
    

    2.2 运行结果

    (从运行时间上看简直无法直视.....)

    59 个文件 2016-11-01  10:04:12 
    ./htmdata/htmls-counter/201620161001水库数据.htm
    Creat new database:reservoir201610
    ./htmdata/htmls-counter/201620161001江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-01,1' for key 'PRIMARY'
    2016-11-01  10:04:26 
    ./htmdata/htmls-counter/201620161002水库数据.htm
    ./htmdata/htmls-counter/201620161002江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-02,1' for key 'PRIMARY'
    2016-11-01  10:05:00 
    ./htmdata/htmls-counter/201620161003水库数据.htm
    ./htmdata/htmls-counter/201620161003江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-03,1' for key 'PRIMARY'
    2016-11-01  10:05:36 
    ./htmdata/htmls-counter/201620161004水库数据.htm
    ./htmdata/htmls-counter/201620161004江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-04,1' for key 'PRIMARY'
    2016-11-01  10:06:05 
    ./htmdata/htmls-counter/201620161005水库数据.htm
    ./htmdata/htmls-counter/201620161005江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-05,1' for key 'PRIMARY'
    2016-11-01  10:06:37 
    ./htmdata/htmls-counter/201620161006水库数据.htm
    ./htmdata/htmls-counter/201620161006江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-06,1' for key 'PRIMARY'
    2016-11-01  10:07:08 
    ./htmdata/htmls-counter/201620161007水库数据.htm
    ./htmdata/htmls-counter/201620161007江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-07,1' for key 'PRIMARY'
    2016-11-01  10:07:37 
    ./htmdata/htmls-counter/201620161009水库数据.htm
    ./htmdata/htmls-counter/201620161009江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-09,1' for key 'PRIMARY'
    2016-11-01  10:08:12 
    ./htmdata/htmls-counter/201620161010水库数据.htm
    ./htmdata/htmls-counter/201620161010江河数据.htm
    2016-11-01  10:09:47 
    ./htmdata/htmls-counter/201620161011水库数据.htm
    ./htmdata/htmls-counter/201620161011江河数据.htm
    2016-11-01  10:11:39 
    ./htmdata/htmls-counter/201620161013水库数据.htm
    ./htmdata/htmls-counter/201620161013江河数据.htm
    2016-11-01  10:13:23 
    ./htmdata/htmls-counter/201620161013雨水情数据.htm
    Mysql Error 1136: Column count doesn't match value count at row 1
    ./htmdata/htmls-counter/201620161015水库数据.htm
    ./htmdata/htmls-counter/201620161015江河数据.htm
    2016-11-01  10:15:09 
    ./htmdata/htmls-counter/201620161015雨水情数据.htm
    Mysql Error 1136: Column count doesn't match value count at row 1
    ./htmdata/htmls-counter/201620161016水库数据.htm
    ./htmdata/htmls-counter/201620161016江河数据.htm
    2016-11-01  10:16:45 
    ./htmdata/htmls-counter/201620161016雨水情数据.htm
    Mysql Error 1136: Column count doesn't match value count at row 1
    ./htmdata/htmls-counter/201620161017水库数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-17,1' for key 'PRIMARY'
    ./htmdata/htmls-counter/201620161017江河数据.htm
    Mysql Error 1062: Duplicate entry '2016-10-17,1' for key 'PRIMARY'
    2016-11-01  10:16:48 
    ./htmdata/htmls-counter/201620161018水库数据.htm
    ./htmdata/htmls-counter/201620161018江河数据.htm
    2016-11-01  10:18:30 
    ./htmdata/htmls-counter/201620161019水库数据.htm
    ./htmdata/htmls-counter/201620161019江河数据.htm
    2016-11-01  10:19:58 
    ./htmdata/htmls-counter/201620161020水库数据.htm
    ./htmdata/htmls-counter/201620161020江河数据.htm
    2016-11-01  10:21:32 
    ./htmdata/htmls-counter/201620161021水库数据.htm
    ./htmdata/htmls-counter/201620161021江河数据.htm
    2016-11-01  10:23:46 
    ./htmdata/htmls-counter/201620161022水库数据.htm
    ./htmdata/htmls-counter/201620161022江河数据.htm
    2016-11-01  10:26:13 
    ./htmdata/htmls-counter/201620161023水库数据.htm
    ./htmdata/htmls-counter/201620161023江河数据.htm
    2016-11-01  10:28:09 
    ./htmdata/htmls-counter/201620161024水库数据.htm
    ./htmdata/htmls-counter/201620161024江河数据.htm
    2016-11-01  10:29:56 
    ./htmdata/htmls-counter/201620161025水库数据.htm
    ./htmdata/htmls-counter/201620161025江河数据.htm
    2016-11-01  10:31:34 
    ./htmdata/htmls-counter/201620161026水库数据.htm
    ./htmdata/htmls-counter/201620161026江河数据.htm
    2016-11-01  10:33:09 
    ./htmdata/htmls-counter/201620161027水库数据.htm
    ./htmdata/htmls-counter/201620161027江河数据.htm
    2016-11-01  10:34:45 
    ./htmdata/htmls-counter/201620161028水库数据.htm
    ./htmdata/htmls-counter/201620161028江河数据.htm
    2016-11-01  10:36:11 
    ./htmdata/htmls-counter/201620161029水库数据.htm
    ./htmdata/htmls-counter/201620161029江河数据.htm
    2016-11-01  10:37:43 
    ./htmdata/htmls-counter/201620161030水库数据.htm
    ./htmdata/htmls-counter/201620161030江河数据.htm
    2016-11-01  10:39:17 
    ./htmdata/htmls-counter/201620161031水库数据.htm
    ./htmdata/htmls-counter/201620161031江河数据.htm
    2016-11-01  10:40:58 
    [Finished in 2206.9s]
    

    3 写Excel文件

    # -*- encoding: utf-8 -*-
    
    from StringIO import StringIO
    import time
    from openpyxl.workbook import Workbook
    from openpyxl.reader.excel import load_workbook  
    from openpyxl.writer.excel import ExcelWriter
    import json
    import MySQLdb
    import os,sys
    #reload(sys)
    #sys.setdefaultencoding( 'utf-8' )
    
    '''
    JSON {"流域":{"站名":{"时间": [水位, 流量 , 警戒水位]},    ...
                 {"站名":{"时间", [水位, 流量 , 警戒水位]}  },    ...
          "黄河":{"站名":{ "时间": [水位, 流量 , 警戒水位]},    ...
                 {"站名":{"时间", [水位, 流量 , 警戒水位]}  },    ...
          "长江":{"站名":{ "时间": [水位, 流量 , 警戒水位]},    ...
                 {"站名":{"时间", [水位, 流量 , 警戒水位]}  },    ...
          }'''
    rjson ={}
    daterow = {}
    def loadJson(dic):
        io = StringIO(dic)
        io = json.dumps(str(io), sort_keys=True)
        return json.loads(io)
    
    def rDB(dbname):
        #数据库操作
        conn= MySQLdb.connect(
                host='localhost',
                port = 3306,
                user='root',
                passwd='root',
                charset='utf8'
                )
        cursor = conn.cursor()
        conn.select_db('information_schema')
        fl = cursor.execute("SELECT * FROM tables WHERE table_name='"+dbname+"';")
        conn.select_db('Grab_Hydrology_Data')
        selectsql = "SELECT 流域, 时间, 站名, 水位, 流量 , 警戒水位 FROM " + dbname
        if 0 == fl :
            print 'NO DB :' + dbname
            cursor.close()
            conn.close()
            return
        else:
            lens = cursor.execute(selectsql)
            results = cursor.fetchall()
            #print type(results),len(results),results[0]
            for nrow  in range(lens):
                #print(results[i][0])
                #设置流域
                rjson.setdefault(results[nrow][0],{})
                #设置站名
                rjson[results[nrow][0]].setdefault(results[nrow][2] ,{})
                #设置时间
                rjson[results[nrow][0]][results[nrow][2]].setdefault(results[nrow][1] ,
                     [results[nrow][3],results[nrow][4],results[nrow][5]])
                #绑定时间
                daterow.setdefault(results[nrow][1],0)
    
        cursor.close()
        conn.close()
    
    def pyxlWsheet(jss,dbname):
        dic = jss
        tn = time.strftime("%Y%m%d", time.localtime())
        fn = u'江河数据' + dbname + '.xlsx'
    
        sheet = Workbook()
        ew = ExcelWriter(workbook = sheet)
        ws = sheet.worksheets[0]
    
        ws.freeze_panes = ws['B4']
    
        ws.title = u'大江大河'
        ws.cell(row=3, column=1, value=u'时间')
    
        list =  sorted(daterow.iteritems(),key=lambda t:t[0],reverse=False)
        rowdate = {}
        #dbname = 'big_rivers201512'
        #rtn =  time.strftime("%Y", time.localtime())+'-'
        rtn = dbname[-6:-2]+'-'
    
        count_line = 3
        for i in range(len(list)):
            if i%2==0:
                count_line += 1
                rowdate[list[i][0]] = count_line
                ws.cell(row=count_line, column=1, value=rtn+list[i][0][:5])
            else:
                rowdate[list[i][0]] = count_line
                
        col = 2
        for k1,v1 in dic.items():
            for k2,v2 in v1.items():
                ws.cell(row=1, column=col, value=k1)
                ws.cell(row=2, column=col, value=k2)
                ws.cell(row=3, column=col, value=u'水位')
                ws.cell(row=3, column=col+1, value=u'流量')
    
                for k3,v3 in v2.items():
                    ws.cell(row=rowdate[k3], column=col, value=v3[0])
                    ws.cell(row=rowdate[k3], column=col+1, value=v3[1])
                col+=2
        ws = sheet.create_sheet()
        ws.freeze_panes = ws['A2']
        ws.title = u'警戒水位'
        rows = 1
        ws.cell(row=rows, column=1, value=u'河流')
        ws.cell(row=rows, column=2, value=u'站名')
        ws.cell(row=rows, column=3, value=u'警戒水位')
        for k1,v1 in dic.items():
            for k2,v2 in v1.items():
                rows += 1
                ws.cell(row=rows, column=1, value=k1)
                ws.cell(row=rows, column=2, value=k2)
                for k3,v3 in v2.items():
                    ws.cell(row=rows, column=3, value=v3[2])
                    break
    
        ew.save(filename = fn)
    
    def freeze_panes(fn):
        wr1 = load_workbook(filename = fn)
        wr = wr1.worksheets[0]
        p = wr['B4']
        wr.freeze_panes = p
        wr1.save(fn)
    
    if __name__=="__main__":
        dbname = 'big_rivers201610'
        rDB(dbname)
        print len(rjson)
        pyxlWsheet(rjson,dbname)
        
    

    4 问题

    • Mysql数据库表设计差,写入效率太低。

  • 相关阅读:
    Oracle2MySQL数据校验
    MySQL物理删除大表文件的方式
    MySQL恢复误删除的ibd文件
    如何下载MySQL企业版数据库
    C++篇之成员函数注册为回调函数
    Java入门篇
    日常收藏之Linux篇
    实现简单网页rtmp直播:nginx+ckplayer+linux
    网络通信之accept函数简述
    网络通信之bind、listen函数简述
  • 原文地址:https://www.cnblogs.com/oucbl/p/6423073.html
Copyright © 2020-2023  润新知