• 爬取70城房价到oracle数据库并6合1


    学习数据分析,然后没有合适的数据源,从国家统计局的网页上抓取一页数据来玩玩(没有发现robots协议,也仅仅发出一次连接请求,不对网站造成任何负荷)

    运行效果

    （运行效果截图）

    源码

    python代码

    '''
    本脚本旨在爬取70城房价进入oracle数据库以供学习
    code by 九命猫幺
    
    网页中有6个表格
        
    最终爬取到数据库中形成6合1报表
    '''
    import requests
    from bs4 import BeautifulSoup
    import numpy as np
    import pandas as pd
    from sqlalchemy import create_engine
    
    #Fetch a web page and return its decoded text.
    def getHTMLText(url):
        """Download *url* and return the response body as text.

        On any request failure the literal string '产生异常' is returned
        instead of raising, preserving the original contract (callers feed
        the result straight into BeautifulSoup).
        """
        try:
            headers = {'User-Agent': 'Baiduspider'}
            r = requests.get(url, headers=headers, timeout=30)
            r.raise_for_status()
            # apparent_encoding sniffs the real charset from the body
            r.encoding = r.apparent_encoding
            return r.text
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; only network/HTTP errors are swallowed
        except requests.RequestException:
            return '产生异常'
    
    #Parse one table body into a flat list of cell strings.
    def getTrText(tbody, tnum):
        """Extract the data cells of table *tnum* (1-6) from *tbody*.

        bs4 yields some two/three-character city names split across line
        breaks (e.g. 秦/皇/岛), so the orphaned tail characters are dropped
        and the leading character is expanded back to the full city name.
        Only the trailing data cells are kept: -280 for tables 1-2
        (70 rows x 4 cols), -350 for tables 3-6 (35 rows x 10 cols).
        """
        raw = []
        cells = []
        for frag in tbody.strings:
            if frag != ' ':
                # '\u3000' is the ideographic (full-width) space used as
                # padding on stats.gov.cn; the original 'u3000' literal
                # (missing backslash) never matched anything.
                raw.append(str(frag.string).replace('\u3000', '').replace(' ', ''))
        for frag in raw:
            # drop the orphaned second/third characters of split city names
            if frag not in ['皇','岛', '家','庄','丹','江','尔','滨','顶','山']:
                cells.append(frag.replace('秦','秦皇岛').replace('石','石家庄').replace('牡','牡丹江').replace('哈','哈尔滨').replace('平','平顶山'))
        # keep only the trailing data cells; header/caption cells are cut off
        cells = cells[{1:-280, 2:-280, 3:-350, 4:-350, 5:-350, 6:-350}[tnum]::]
        return cells
    
    #Load the parsed cell list into the database as table tb_fj_70city_t<N>.
    def toSql(uinfo, tnum, con=None):
        """Reshape *uinfo* into a DataFrame and write it to table
        tb_fj_70city_t<tnum>, replacing any previous contents.

        Tables 1-2 hold 70 rows x 4 cols (all cities); tables 3-6 hold
        35 rows x 10 cols (size-band breakdown).

        *con* may be any connectable accepted by DataFrame.to_sql; when
        omitted an Oracle engine is created for this call (original
        behaviour) and disposed afterwards so pooled connections are not
        leaked on repeated calls.
        """
        if tnum in [1, 2]:
            df = pd.DataFrame(np.array(uinfo).reshape(70, 4),
                              columns=['city', 'mom', 'yoy', 'fbr'])
        else:
            df = pd.DataFrame(np.array(uinfo).reshape(35, 10),
                              columns=['city','mom_90l','yoy_90l','fbr_90l','mom_90t144','yoy_90t144','fbr_90t144','mom_144u','yoy_144u','fbr_144u'])
        owns_engine = con is None
        if owns_engine:
            # NOTE(review): credentials are hard-coded; consider moving to config
            con = create_engine('oracle+cx_oracle://edw:oracle@192.168.168.5:1521/?service_name=edw')
        try:
            df.to_sql('tb_fj_70city_t' + str(tnum), con, if_exists='replace', index=False)
        finally:
            if owns_engine:
                con.dispose()  # release the pool created by this call
        
        
    if __name__ == "__main__":
        uinfo = []
        # release page of the October 2019 bulletin
        url = 'http://www.stats.gov.cn/tjsj/zxfb/201911/t20191115_1709560.html'
        
        # fetch the page (a single request, so no load on the site)
        html = getHTMLText(url) 
        # NOTE(review): on a request failure html is the literal '产生异常',
        # which parses to zero tables and raises IndexError below — confirm intended.
        soup = BeautifulSoup(html,'html.parser')
        # the bulletin embeds six MsoNormalTable tables
        tbody = soup.select('table.MsoNormalTable tbody')
        # parse each table and store it
        for i in range(6):
            # parse table i+1 into a flat cell list
            uinfo = getTrText(tbody[i],i+1)
            # write it to Oracle as tb_fj_70city_t<i+1>
            toSql(uinfo,i+1)
    
    

    数据库代码

    --70个大中城市商品住宅销售价格变动情况
    -- Sales-price changes of commercial housing in 70 large/medium cities.
    -- Builds one wide snapshot table for 2019-10 from the six staging
    -- tables written by the Python script. pandas to_sql created every
    -- staging column as text, hence the to_char/to_number casts throughout.
    CREATE TABLE tb_fj_70city_201910 AS
    WITH tmp1 AS(
    -- t1: newly built homes, all sizes (new_* columns, 70 rows)
    SELECT to_char(a.city) city,to_number(a.mom) new_mom,to_number(a.yoy) new_yoy,to_number(a.fbr) new_fbr
    FROM tb_fj_70city_t1 a),
    tmp2 AS(
    -- t2: second-hand homes, all sizes (old_* columns, 70 rows)
    SELECT to_char(a.city) city,to_number(a.mom) old_mom,to_number(a.yoy) old_yoy,to_number(a.fbr) old_fbr
    FROM tb_fj_70city_t2 a),
    tmp3 AS(
    -- t3 UNION t4: newly built homes by size band (35 cities each)
    SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
    to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
    to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
    FROM tb_fj_70city_t3 a
    UNION
    SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
    to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
    to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
    FROM tb_fj_70city_t4 a),
    tmp4 AS(
    -- t5 UNION t6: second-hand homes by size band (35 cities each)
    SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
    to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
    to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
    FROM tb_fj_70city_t5 a
    UNION
    SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
    to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
    to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
    FROM tb_fj_70city_t6 a)
    -- join the four CTEs on city into one wide row per city
    SELECT 201910 month,aa.city,aa.new_mom,aa.new_yoy,aa.new_fbr,bb. old_mom,bb.old_yoy,bb.old_fbr,
    cc.new_mom_90l,cc.new_yoy_90l,cc.new_fbr_90l,
    cc.new_mom_90t144,cc.new_yoy_90t144,cc.new_fbr_90t144,
    cc.new_mom_144u,cc.new_yoy_144u,cc.new_fbr_144u,
    dd.old_mom_90l,dd.old_yoy_90l,dd.old_fbr_90l,
    dd.old_mom_90t144,dd.old_yoy_90t144,dd.old_fbr_90t144,
    dd.old_mom_144u,dd.old_yoy_144u,dd.old_fbr_144u
    FROM tmp1 aa
    JOIN tmp2 bb ON aa.city=bb.city
    JOIN tmp3 cc ON aa.city=cc.city
    JOIN tmp4 dd ON aa.city=dd.city;
    
    -- the staging tables are no longer needed once the wide table exists
    CALL p_drop_table_if_exist('tb_fj_70city_t1');
    CALL p_drop_table_if_exist('tb_fj_70city_t2');
    CALL p_drop_table_if_exist('tb_fj_70city_t3');
    CALL p_drop_table_if_exist('tb_fj_70city_t4');
    CALL p_drop_table_if_exist('tb_fj_70city_t5');
    CALL p_drop_table_if_exist('tb_fj_70city_t6');
    
    -- sanity check of the result
    SELECT * FROM tb_fj_70city_201910;
    

    就这样,表中列名取英文首字母缩写:

    mom:month on month ,环比

    yoy:year on year,同比

    fbr:fixed base ratio,定基比

    90l:90 lower,90平米以下

    144u:144 upper,144平米以上

    90t144:90 to 144,90到144平米之间

    优化后

    上述脚本只能爬取一个月的,并且6表合1操作在数据库中执行,现在优化为批量爬取多个月份的数据


    '''
    本脚本旨在爬取70城房价进入oracle数据库以供学习
    code by 九命猫幺
    
    网页中有6个表格
        
    最终爬取到数据库中形成6合1报表
    
    网址:
    '''
    import requests
    from bs4 import BeautifulSoup
    import numpy as np
    import pandas as pd
    from sqlalchemy import create_engine
    import cx_Oracle
    
    #Fetch a web page and return its decoded text.
    def getHTMLText(url):
        """Download *url* and return its body as text; on failure return the
        literal '产生异常' (kept for backward compatibility with callers that
        parse the result directly)."""
        try:
            headers = {'User-Agent': 'Baiduspider'}
            r = requests.get(url, headers=headers, timeout=30)
            r.raise_for_status()
            # sniff the real charset (the bulletin pages are not always UTF-8)
            r.encoding = r.apparent_encoding
            return r.text
        # was a bare `except:`; narrowed so only request errors are swallowed
        except requests.RequestException:
            return '产生异常'
    
    #Parse one table body into a flat list of cell strings.
    def getTrText(tbody, tnum):
        """Extract the data cells of table *tnum* (1-6) from *tbody*.

        City names split by bs4 across line breaks (秦/皇/岛 etc.) are
        repaired: orphaned tail characters are dropped and the first
        character is expanded to the full name. Only the trailing data
        cells survive the final slice (-280 for tables 1-2, -350 for 3-6).
        """
        fragments = []
        cells = []
        for node in tbody.strings:
            if node != ' ':
                # strip '\u3000' (full-width space padding); the original
                # 'u3000' literal was missing its backslash and never matched
                fragments.append(str(node.string).replace('\u3000', '').replace(' ', ''))
        for text in fragments:
            # skip the orphaned second/third characters of split city names
            if text not in ['皇','岛', '家','庄','丹','江','尔','滨','顶','山']:
                cells.append(text.replace('秦','秦皇岛').replace('石','石家庄').replace('牡','牡丹江').replace('哈','哈尔滨').replace('平','平顶山'))
        # drop header/caption cells, keeping only the data region
        cells = cells[{1:-280, 2:-280, 3:-350, 4:-350, 5:-350, 6:-350}[tnum]::]
        return cells
    
    #Load the parsed cell list into the database as table tb_fj_70city_t<N>.
    def toSql(uinfo, tnum, con=None):
        """Write *uinfo* to table tb_fj_70city_t<tnum> (replace mode).

        Tables 1-2: 70 rows x 4 cols; tables 3-6: 35 rows x 10 cols.
        *con* may be any DataFrame.to_sql connectable; when omitted an
        Oracle engine is created per call (original behaviour) and disposed
        afterwards to avoid leaking connection pools on repeated calls.
        """
        if tnum in [1, 2]:
            df = pd.DataFrame(np.array(uinfo).reshape(70, 4),
                              columns=['city', 'mom', 'yoy', 'fbr'])
        else:
            df = pd.DataFrame(np.array(uinfo).reshape(35, 10),
                              columns=['city','mom_90l','yoy_90l','fbr_90l','mom_90t144','yoy_90t144','fbr_90t144','mom_144u','yoy_144u','fbr_144u'])
        owns_engine = con is None
        if owns_engine:
            # NOTE(review): hard-coded DSN/credentials; consider externalizing
            con = create_engine('oracle+cx_oracle://edw:oracle@192.168.168.5:1521/?service_name=edw')
        try:
            df.to_sql('tb_fj_70city_t' + str(tnum), con, if_exists='replace', index=False)
        finally:
            if owns_engine:
                con.dispose()
      
    #Merge the six staging tables into one wide table for *month*.
    def intoWideTable(month):
        """Create table tb_fj_70city_<month> by joining staging tables t1-t6.

        The cursor and connection are now closed in ``finally`` blocks so a
        failing statement no longer leaks the Oracle session (the original
        left both open on error).

        NOTE(review): table names cannot be bind variables, so *month* is
        concatenated into the SQL; it is an internal int key, not user input.
        """
        con = cx_Oracle.connect('edw','oracle','192.168.168.5:1521/edw')
        try:
            cur = con.cursor()
            try:
                cur.execute("CALL p_drop_table_if_exist('tb_fj_70city_"+str(month)+"')")
                # CREATE TABLE ... AS SELECT; staging columns are text, so
                # every value is cast via to_char/to_number
                cur.execute('''CREATE TABLE tb_fj_70city_'''+str(month)+''' AS
    WITH tmp1 AS(
    SELECT to_char(a.city) city,to_number(a.mom) new_mom,to_number(a.yoy) new_yoy,to_number(a.fbr) new_fbr
    FROM tb_fj_70city_t1 a),
    tmp2 AS(
    SELECT to_char(a.city) city,to_number(a.mom) old_mom,to_number(a.yoy) old_yoy,to_number(a.fbr) old_fbr
    FROM tb_fj_70city_t2 a),
    tmp3 AS(
    SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
    to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
    to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
    FROM tb_fj_70city_t3 a
    UNION
    SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
    to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
    to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
    FROM tb_fj_70city_t4 a),
    tmp4 AS(
    SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
    to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
    to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
    FROM tb_fj_70city_t5 a
    UNION
    SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
    to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
    to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
    FROM tb_fj_70city_t6 a)
    SELECT '''+str(month)+''' month,aa.city,aa.new_mom,aa.new_yoy,aa.new_fbr,bb. old_mom,bb.old_yoy,bb.old_fbr,
    cc.new_mom_90l,cc.new_yoy_90l,cc.new_fbr_90l,
    cc.new_mom_90t144,cc.new_yoy_90t144,cc.new_fbr_90t144,
    cc.new_mom_144u,cc.new_yoy_144u,cc.new_fbr_144u,
    dd.old_mom_90l,dd.old_yoy_90l,dd.old_fbr_90l,
    dd.old_mom_90t144,dd.old_yoy_90t144,dd.old_fbr_90t144,
    dd.old_mom_144u,dd.old_yoy_144u,dd.old_fbr_144u
    FROM tmp1 aa
    JOIN tmp2 bb ON aa.city=bb.city
    JOIN tmp3 cc ON aa.city=cc.city
    JOIN tmp4 dd ON aa.city=dd.city''')
            finally:
                cur.close()
        finally:
            con.close()
        
    if __name__ == "__main__":
        uinfo = []
        # month key (YYYYMM) -> bulletin URL for that month's release
        urls = {201910:'http://www.stats.gov.cn/tjsj/zxfb/201911/t20191115_1709560.html',
                201909:'http://www.stats.gov.cn/tjsj/zxfb/201910/t20191021_1704063.html',
                201908:'http://www.stats.gov.cn/tjsj/zxfb/201909/t20190917_1697943.html',
                201907:'http://www.stats.gov.cn/statsinfo/auto2074/201908/t20190815_1691536.html',
                201906:'http://www.stats.gov.cn/tjsj/zxfb/201907/t20190715_1676000.html',
                201905:'http://www.stats.gov.cn/tjsj/zxfb/201906/t20190618_1670960.html',
                201904:'http://www.stats.gov.cn/tjsj/zxfb/201905/t20190516_1665286.html',
                201903:'http://www.stats.gov.cn/tjsj/zxfb/201904/t20190416_1659682.html'
                }
        for key in urls:
            # fetch this month's bulletin page (one request per month)
            html = getHTMLText(urls[key]) 
            soup = BeautifulSoup(html,'html.parser')
            # each bulletin embeds six MsoNormalTable tables
            tbody = soup.select('table.MsoNormalTable tbody')
            # parse and stage the six tables as tb_fj_70city_t1..t6
            for i in range(6):
                # parse table i+1 into a flat cell list
                uinfo = getTrText(tbody[i],i+1)
                # stage it in Oracle
                toSql(uinfo,i+1)
            # merge the staging tables into tb_fj_70city_<key>
            intoWideTable(key)
    
    

    （截图）数据库中同时得到了多个月份的数据

    再优化单一月份爬取的代码


    import requests
    from bs4 import BeautifulSoup
    import numpy as np
    import pandas as pd
    from sqlalchemy import create_engine
    import cx_Oracle
     
    #Fetch a web page and return its decoded text.
    def getHTMLText(url):
        """Download *url*; return the body as text, or the literal
        '产生异常' on any request failure (original contract preserved)."""
        try:
            headers = {'User-Agent': 'Baiduspider'}
            r = requests.get(url, headers=headers, timeout=30)
            r.raise_for_status()
            # decode with the charset sniffed from the response body
            r.encoding = r.apparent_encoding
            return r.text
        # narrowed from bare `except:` — only request/HTTP errors are caught
        except requests.RequestException:
            return '产生异常'
     
    #Parse one table body into a flat list of cell strings.
    def getTrText(tbody, tnum):
        """Extract the data cells of table *tnum* from *tbody*.

        Consecutive Chinese fragments are merged back into one cell: bs4
        yields city names split across line breaks, so any CJK string that
        follows another CJK string is appended to the previous cell.
        The final slice keeps only the data cells (-280 for tables 1-2,
        -350 otherwise).
        """
        fragments = []
        cells = ['...']  # non-CJK sentinel so cells[-1] always exists
        for node in tbody.strings:
            if node not in ['  ',' ']:
                # NOTE(review): the original stripped a two-space run here;
                # possibly '\u3000' was intended — behaviour left unchanged
                fragments.append(str(node.string).replace('  ',''))
        for text in fragments:
            # '\u4e00'-'\u9fa5' is the CJK Unified Ideographs range; the
            # original 'u4e00'/'u9fa5' literals (missing backslashes) never
            # matched Chinese text, so split city names stayed unmerged
            if '\u4e00' <= text <= '\u9fa5' and '\u4e00' <= cells[-1] <= '\u9fa5':
                cells[-1] = cells[-1] + text
            else:
                cells.append(text)
        cells = cells[{1:-280, 2:-280}.get(tnum, -350)::]
        return cells
     
    #Load the parsed cell list into the database as table tb_fj_70city_t<N>.
    def toSql(uinfo, tnum, con=None):
        """Write *uinfo* to table tb_fj_70city_t<tnum> (replace mode).

        Tables 1-2: 70 rows x 4 cols; tables 3-6: 35 rows x 10 cols.
        *con* may be any DataFrame.to_sql connectable; when omitted an
        Oracle engine is created per call (original behaviour) and disposed
        afterwards so repeated monthly runs do not leak connection pools.
        """
        if tnum in [1, 2]:
            df = pd.DataFrame(np.array(uinfo).reshape(70, 4),
                              columns=['city', 'mom', 'yoy', 'fbr'])
        else:
            df = pd.DataFrame(np.array(uinfo).reshape(35, 10),
                              columns=['city','mom_90l','yoy_90l','fbr_90l','mom_90t144','yoy_90t144','fbr_90t144','mom_144u','yoy_144u','fbr_144u'])
        owns_engine = con is None
        if owns_engine:
            # NOTE(review): hard-coded DSN/credentials; consider externalizing
            con = create_engine('oracle+cx_oracle://edw:oracle@192.168.168.5:1521/?service_name=edw')
        try:
            df.to_sql('tb_fj_70city_t' + str(tnum), con, if_exists='replace', index=False)
        finally:
            if owns_engine:
                con.dispose()
         
    #Merge the six staging tables into a monthly table and the history table.
    def intoWideTable(month):
        """Create tb_fj_70city_<month> from staging tables t1-t6, then
        refresh that month's rows in the cumulative table tb_fj_70city.

        The delete+insert pair makes the month refresh idempotent; the
        commit covers both DML statements. Cursor and connection are now
        closed in ``finally`` blocks so a failing statement no longer leaks
        the Oracle session (the original left both open on error).

        NOTE(review): table names cannot be bind variables, so *month* is
        concatenated into the SQL; it is an internal int key, not user input.
        """
        con = cx_Oracle.connect('edw','oracle','192.168.168.5:1521/edw')
        try:
            cur = con.cursor()
            try:
                cur.execute("CALL p_drop_table_if_exist('tb_fj_70city_"+str(month)+"')")
                # CREATE TABLE ... AS SELECT; staging columns are text, so
                # every value is cast via to_char/to_number
                cur.execute('''CREATE TABLE tb_fj_70city_'''+str(month)+''' AS
    WITH tmp1 AS(
    SELECT to_char(a.city) city,to_number(a.mom) new_mom,to_number(a.yoy) new_yoy,to_number(a.fbr) new_fbr
    FROM tb_fj_70city_t1 a),
    tmp2 AS(
    SELECT to_char(a.city) city,to_number(a.mom) old_mom,to_number(a.yoy) old_yoy,to_number(a.fbr) old_fbr
    FROM tb_fj_70city_t2 a),
    tmp3 AS(
    SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
    to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
    to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
    FROM tb_fj_70city_t3 a
    UNION
    SELECT to_char(a.city) city,to_number(a.mom_90l) new_mom_90l,to_number(a.yoy_90l) new_yoy_90l,to_number(a.fbr_90l) new_fbr_90l,
    to_number(a.mom_90t144) new_mom_90t144,to_number(a.yoy_90t144) new_yoy_90t144,to_number(a.fbr_90t144) new_fbr_90t144,
    to_number(a.mom_144u) new_mom_144u,to_number(a.yoy_144u) new_yoy_144u,to_number(a.fbr_144u) new_fbr_144u
    FROM tb_fj_70city_t4 a),
    tmp4 AS(
    SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
    to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
    to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
    FROM tb_fj_70city_t5 a
    UNION
    SELECT to_char(a.city) city,to_number(a.mom_90l) old_mom_90l,to_number(a.yoy_90l) old_yoy_90l,to_number(a.fbr_90l) old_fbr_90l,
    to_number(a.mom_90t144) old_mom_90t144,to_number(a.yoy_90t144) old_yoy_90t144,to_number(a.fbr_90t144) old_fbr_90t144,
    to_number(a.mom_144u) old_mom_144u,to_number(a.yoy_144u) old_yoy_144u,to_number(a.fbr_144u) old_fbr_144u
    FROM tb_fj_70city_t6 a)
    SELECT '''+str(month)+''' month,aa.city,aa.new_mom,aa.new_yoy,aa.new_fbr,bb. old_mom,bb.old_yoy,bb.old_fbr,
    cc.new_mom_90l,cc.new_yoy_90l,cc.new_fbr_90l,
    cc.new_mom_90t144,cc.new_yoy_90t144,cc.new_fbr_90t144,
    cc.new_mom_144u,cc.new_yoy_144u,cc.new_fbr_144u,
    dd.old_mom_90l,dd.old_yoy_90l,dd.old_fbr_90l,
    dd.old_mom_90t144,dd.old_yoy_90t144,dd.old_fbr_90t144,
    dd.old_mom_144u,dd.old_yoy_144u,dd.old_fbr_144u
    FROM tmp1 aa
    JOIN tmp2 bb ON aa.city=bb.city
    JOIN tmp3 cc ON aa.city=cc.city
    JOIN tmp4 dd ON aa.city=dd.city''')
                # idempotent refresh of this month's rows in the history table
                cur.execute('delete from tb_fj_70city where month='+str(month))
                cur.execute('insert into tb_fj_70city select * from tb_fj_70city_'+str(month))
                con.commit()
            finally:
                cur.close()
        finally:
            con.close()
         
    if __name__ == "__main__":
        uinfo = []
        # month key (YYYYMM) for the single bulletin fetched below
        month = 201910
        url = 'http://www.stats.gov.cn/tjsj/zxfb/201911/t20191115_1709560.html'
         
        # fetch the page (a single request, so no load on the site)
        html = getHTMLText(url)
        soup = BeautifulSoup(html,'html.parser')
        # the bulletin embeds six MsoNormalTable tables
        tbody = soup.select('table.MsoNormalTable tbody')
        # parse and stage the six tables as tb_fj_70city_t1..t6
        for i in range(6):
            # parse table i+1 into a flat cell list        
            uinfo = getTrText(tbody[i],i+1)
            # stage it in Oracle
            toSql(uinfo,i+1)
        # merge staging tables into tb_fj_70city_<month> and the history table
        intoWideTable(month)
    
    
    

    ok了

  • 相关阅读:
    力扣Leetcode 3. 无重复字符的最长子串
    力扣Leetcode 21. 合并两个有序链表
    力扣Leetcode 202. 快乐数 -快慢指针 快乐就完事了
    力扣Leetcode 面试题56
    力扣Leetcode 33. 搜索旋转排序数组
    力扣Leetcode 46. 全排列
    python123期末四题编程题 -无空隙回声输出-文件关键行数-字典翻转输出-《沉默的羔羊》之最多单词
    深入理解ReentrantLock的实现原理
    深入图解AQS实现原理和源码分析
    Java:CAS(乐观锁)
  • 原文地址:https://www.cnblogs.com/yongestcat/p/11976852.html
Copyright © 2020-2023  润新知