• 18 Scraping China Weather Network data (sorted and visualized)


     1 """中国天气网爬虫"""
     2 
     3 import requests
     4 from bs4 import BeautifulSoup
     5 from pyecharts import Bar
     6 
     7 
     8 HEADERS = {
     9     'User-Agent': 'Mozilla/5.0'
    10 }
    11 
    12 ALL_DATA = []       # 所有爬取的数据
    13 
    def parse_detail_page(url, is_html5lib):
        """Scrape the city and minimum-temperature rows from one regional page."""

        response = requests.get(url, headers=HEADERS)
        text = response.content.decode('utf-8')
        if is_html5lib:
            soup = BeautifulSoup(text, 'html5lib')
        else:
            soup = BeautifulSoup(text, 'lxml')
        # walk every forecast table inside the conMidtab container
        conMidtab = soup.find_all('div', attrs={'class': 'conMidtab'})
        tables = conMidtab[0].find_all('table')
        for table in tables:
            trs = table.find_all('tr')[2:]      # skip the two header rows
            for index, tr in enumerate(trs):
                tds = tr.find_all('td')
                city_td = tds[0]
                if index == 0:
                    # the first data row starts with a province cell (rowspan),
                    # so the city name sits in the second cell
                    city_td = tds[1]
                city = list(city_td.stripped_strings)[0]
                temp_td = tds[-2]
                min_temp = list(temp_td.stripped_strings)[0]
                # store the row; cast the minimum temperature to int for sorting
                ALL_DATA.append({'city': city, 'min_temp': int(min_temp)})

    def data_visualization():
        """Take the 10 lowest temperatures and show them as a bar chart."""

        # ascending sort, so the coldest cities come first
        ALL_DATA.sort(key=lambda data: data['min_temp'])
        # keep the top 10
        data = ALL_DATA[0:10]
        print(data)
        cities = list(map(lambda x: x['city'], data))
        temps = list(map(lambda x: x['min_temp'], data))
        chart = Bar("China lowest-temperature ranking")
        chart.add('', cities, temps)
        chart.render('weather.html')

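    # Side note (not part of the original code): sorting every row just to keep
    # the 10 smallest is O(n log n); heapq.nsmallest(10, ALL_DATA,
    # key=lambda d: d['min_temp']) would find them in a single pass.
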
    def get_detail_urls(url, base_url):
        """Collect the detail-page links for North China, Northeast China,
        East China, Central China, South China, Northwest China, Southwest
        China and Hong Kong/Macao/Taiwan."""

        urllists = []       # detail-page URLs
        response = requests.get(url, headers=HEADERS)
        text = response.content.decode('utf-8')
        soup = BeautifulSoup(text, 'lxml')
        # the region tabs live in the first ul.lq_contentboxTab2
        uls = soup.find_all('ul', class_='lq_contentboxTab2')
        alists = uls[0].find_all('a')
        for a in alists:    # renamed from `list`, which shadowed the built-in
            newurl = base_url + a['href']
            urllists.append(newurl)

        return urllists

    def spider():
        """Entry point: collect the regional pages, scrape each one, then plot."""

        # starting page: the North China text forecast
        src_url = "http://www.weather.com.cn/textFC/hb.shtml"
        base_url = "http://www.weather.com.cn"
        urllists = get_detail_urls(src_url, base_url)
        for index, urllist in enumerate(urllists):
            # the last link, http://www.weather.com.cn/textFC/gat.shtml, must be
            # parsed with html5lib, otherwise the data comes out wrong
            is_html5lib = (index == len(urllists) - 1)
            parse_detail_page(urllist, is_html5lib)

        # sort and visualize the data
        data_visualization()

    if __name__ == '__main__':
        spider()
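
    The html5lib switch matters: the Hong Kong/Macao/Taiwan page ships malformed
    HTML that lxml mis-parses, while html5lib rebuilds the tree the way a browser
    would. A quick check of the difference (a sketch; the exact counts depend on
    the live page, and the timeout is an addition the original code omits):

        import requests
        from bs4 import BeautifulSoup

        url = "http://www.weather.com.cn/textFC/gat.shtml"
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        html = resp.content.decode('utf-8')
        for parser in ('lxml', 'html5lib'):
            soup = BeautifulSoup(html, parser)
            tables = soup.find('div', class_='conMidtab').find_all('table')
            rows = sum(len(t.find_all('tr')) for t in tables)
            print(parser, '->', len(tables), 'tables,', rows, 'rows')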

    Note: pyecharts must be installed.

    Recommended: pip install pyecharts==0.1.9.5 (the script uses the pre-1.0 API).

    https://blog.csdn.net/Nurbiya_K/article/details/105354670

    https://www.jianshu.com/p/b718c307a61c
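
    The chart calls above target the pre-1.0 pyecharts API, which is why the
    pinned version is recommended. On pyecharts >= 1.0 the plotting code would
    change; a minimal sketch of the equivalent chart (placeholder data stands in
    for the scraped `cities` and `temps` lists):

        from pyecharts import options as opts
        from pyecharts.charts import Bar

        cities = ['Mohe', 'Genhe']      # placeholders; use the scraped lists
        temps = [-38, -35]
        chart = (
            Bar()
            .add_xaxis(cities)
            .add_yaxis('', temps)
            .set_global_opts(title_opts=opts.TitleOpts(
                title="China lowest-temperature ranking"))
        )
        chart.render('weather.html')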
