• python爬取疫情数据详解


    首先逐步分析每行代码的意思:

    这是要引入的东西:

    from os import path
    import requests
    from bs4 import BeautifulSoup
    import json
    import pymysql
    import numpy as np
    import time

    输入请求地址:

    #请求地址
    url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'

    为了防止被反爬虫(伪装成浏览器):

    #为了避免反爬,伪装成浏览器:
    #创建头部信息
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    response =  requests.get(url,headers = headers)  #发送网络请求

    页面输出显示信息:

    #print(response.content.decode('utf-8'))#以字节流形式打印网页源码
    content = response.content.decode('utf-8')
    #print(content)

    进行解析:

    soup = BeautifulSoup(content, 'html.parser')#指定Beautiful的解析器为“html.parser”

    之后就是对于数组的处理:

    '''*find()
    返回的是第一个匹配的标签结果
    *find_all()
    返回的是所有匹配结果的列表'''
    listA = soup.find_all(name='script',attrs={"id":"getAreaStat"})
    #世界确诊
    listB = soup.find_all(name='script',attrs={"id":"getListByCountryTypeService2"})
    account = str(listA)#转化成字符串
    messages = account[52:-21]#截取从52到后边倒数21个

    转换类型:

    messages_json = json.loads(messages)#json.loads 用于解码 JSON 数据。该函数返回 Python 字段的数据类型。

    之后就是将相关的数据传入list,然后对数据库进行操作了。

    具体的代码如下:

    from os import path
    import requests
    from bs4 import BeautifulSoup
    import json
    import pymysql
    import numpy as np
    import time

    # Scrape the DXY 2019-nCoV dashboard page and persist per-province and
    # per-city case counts into the MySQL table `info3`.

    # Request target.
    url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'
    # Pretend to be a normal browser so the site does not reject us as a bot.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    response = requests.get(url, headers=headers)  # send the HTTP request
    content = response.content.decode('utf-8')     # page source as text

    # The province data is embedded in a <script id="getAreaStat"> tag.
    soup = BeautifulSoup(content, 'html.parser')
    listA = soup.find_all(name='script', attrs={"id": "getAreaStat"})
    # World data lives in a second script tag (fetched but unused below).
    listB = soup.find_all(name='script', attrs={"id": "getListByCountryTypeService2"})

    account = str(listA)  # stringify so we can slice out the JSON payload
    # Strip the surrounding script-tag text: chars 52 .. -21 hold the JSON array.
    # NOTE(review): these offsets are tied to the current page markup — re-verify
    # if json.loads starts failing.
    messages = account[52:-21]
    messages_json = json.loads(messages)  # decode JSON into Python objects

    # Build row tuples: (id, timestamp, province, city, confirmed, suspected,
    # cured, dead, locationId). Provinces have city=None.
    valuesList = []
    cityList = []
    # One shared timestamp keeps every row of this run consistent.
    snapshot = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    con = len(messages_json)  # city rows get ids numbered after the province rows
    for k, province in enumerate(messages_json, start=1):
        short_name = province.get('provinceShortName')
        valuesList.append((k, snapshot, short_name, None,
                           province.get('confirmedCount'),
                           province.get('suspectedCount'),
                           province.get('curedCount'),
                           province.get('deadCount'),
                           province.get('locationId')))
        # `cities` may be absent/None for some entries — guard against len(None).
        for city in province.get('cities') or []:
            con += 1
            cityList.append((con, snapshot, short_name,
                             city.get('cityName'),
                             city.get('confirmedCount'),
                             city.get('suspectedCount'),
                             city.get('curedCount'),
                             city.get('deadCount'),
                             city.get('locationId')))

    # PyMySQL 1.0+ removed positional connect() arguments — use keywords.
    db = pymysql.connect(host="localhost", user="root", password="密码",
                         database="ceshi1", charset='utf8')
    cursor = db.cursor()
    sql_clean_province = "TRUNCATE TABLE info3"
    # Parameterized insert — values are bound by the driver, not string-built.
    sql = "INSERT INTO info3 values (%s,%s,%s,%s,%s,%s,%s,%s,%s) "
    value_tuple = tuple(valuesList)
    cityTuple = tuple(cityList)

    try:
        try:
            cursor.execute(sql_clean_province)  # clear the previous snapshot
            db.commit()
        except Exception as e:  # narrow from bare except; report the cause
            print('执行失败,进入回调1', e)
            db.rollback()

        try:
            cursor.executemany(sql, value_tuple)  # province rows
            db.commit()
        except Exception as e:
            print('执行失败,进入回调3', e)
            db.rollback()

        try:
            cursor.executemany(sql, cityTuple)  # city rows
            db.commit()
        except Exception as e:
            print('执行失败,进入回调4', e)
            db.rollback()
    finally:
        # Always release the connection, even if a statement above raised.
        cursor.close()
        db.close()
  • 相关阅读:
    Windows 7下 搭建 基于 ssh 的sftp 服务器
    理解ThreadLocal(二)
    理解ThreadLocal(一)
    Putty使用公钥认证时,报错:Disconnected: No supported authentication methods available(server sent:public key) 问题的解决
    SFTP信任公钥配置及JSCH库
    怎样在WIN7系统下安装IIS
    深入研究B树索引(一)
    【转】EntityFramework(EF)贪婪加载和延迟加载的选择和使用
    【转】Ext JS 集合1713个icon图标的CSS文件
    [转]ASP.NET MVC 4 最佳实践宝典
  • 原文地址:https://www.cnblogs.com/dazhi151/p/12461830.html
Copyright © 2020-2023  润新知