17 Scraping Data from the China Weather Network (weather.com.cn)
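
The script below fetches the text-forecast pages on weather.com.cn region by region and prints each city together with its forecast minimum temperature. It depends on the third-party packages requests, beautifulsoup4, lxml, and html5lib.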


     1 """中国天气网爬虫"""
     2 
     3 import requests
     4 from bs4 import BeautifulSoup
     5 
     6 HEADERS = {
     7     'User-Agent': 'Mozilla/5.0',
     8 }
     9 
    def parse_detail_page(url, is_html5lib):
        """Scrape the city data from one region's forecast page."""
        response = requests.get(url, headers=HEADERS)
        text = response.content.decode('utf-8')
        # Uncomment to dump the raw HTML for debugging:
        # with open('weather.html', 'w', encoding='utf-8') as fp:
        #     fp.write(text)
        if is_html5lib:
            soup = BeautifulSoup(text, 'html5lib')
        else:
            soup = BeautifulSoup(text, 'lxml')
        # The forecast tables all sit inside the first div.conMidtab.
        conMidtab = soup.find_all('div', attrs={'class': 'conMidtab'})
        tables = conMidtab[0].find_all('table')
        for table in tables:
            trs = table.find_all('tr')[2:]  # skip the two header rows
            for index, tr in enumerate(trs):
                tds = tr.find_all('td')
                city_td = tds[0]
                if index == 0:
                    # The first data row starts with a rowspan province
                    # cell, so the city name sits in the second cell.
                    city_td = tds[1]
                city = list(city_td.stripped_strings)[0]
                temp_td = tds[-2]  # second-to-last cell: minimum temperature
                min_temp = list(temp_td.stripped_strings)[0]
                # Print the city and its minimum temperature
                print({'city': city, 'min_temp': min_temp})

        print("=" * 40)

    def get_detail_urls(url, base_url):
        """Get the detail-page links for North China, Northeast China,
        East China, Central China, South China, Northwest China,
        Southwest China, and Hong Kong/Macao/Taiwan."""
        urllists = []   # detail-page URLs
        response = requests.get(url, headers=HEADERS)
        text = response.content.decode('utf-8')
        soup = BeautifulSoup(text, 'lxml')
        # The region tabs live in the <ul> with class lq_contentboxTab2.
        uls = soup.find_all('ul', class_='lq_contentboxTab2')
        alists = uls[0].find_all('a')
        for a in alists:    # renamed from `list`, which shadowed the built-in
            newurl = base_url + a['href']
            urllists.append(newurl)

        return urllists

    def spider():
        """Entry point: collect the region links, then parse each page."""
        # Initial page to crawl (North China text forecast).
        src_url = "http://www.weather.com.cn/textFC/hb.shtml"
        base_url = "http://www.weather.com.cn"
        urllists = get_detail_urls(src_url, base_url)
        # print(urllists)
        for index, urllist in enumerate(urllists):
            # The last link, http://www.weather.com.cn/textFC/gat.shtml,
            # must be parsed with html5lib, otherwise the data comes out
            # wrong; lxml is fine for the other pages.
            is_html5lib = (index == len(urllists) - 1)
            parse_detail_page(urllist, is_html5lib)

    if __name__ == '__main__':
        spider()
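
The script only prints each record. As a minimal sketch of how the same data could be persisted instead (not part of the original post; the function name save_records and the weather.csv filename are illustrative), the dicts that parse_detail_page builds can be collected into a list and written out with the standard csv module:

    import csv

    def save_records(records, path='weather.csv'):
        """Write records shaped like {'city': ..., 'min_temp': ...} to CSV."""
        with open(path, 'w', newline='', encoding='utf-8') as fp:
            writer = csv.DictWriter(fp, fieldnames=['city', 'min_temp'])
            writer.writeheader()
            writer.writerows(records)

This assumes parse_detail_page is changed to append its dicts to a list and return it rather than printing them.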
Original article: https://www.cnblogs.com/sruzzg/p/13096959.html