1、通过爬取历史天气首页，获取各城市的链接和历史月份列表，并据此构建每个城市各月份的页面链接；
"""Collect city names and history links, then store one URL per city and month.

Running this script reads the city list from the history index page and
writes one ``{'time_url', 'city'}`` document per city and month into the
``time_url_table`` collection of the ``tianqi_data`` MongoDB database.
"""
import random

import pymongo
import requests
from lxml import etree

from time_list import get_time

client = pymongo.MongoClient('localhost', 27017)
tianqi_data = client['tianqi_data']
time_url_table = tianqi_data['time_url_table']

headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {
    'User-Agent': random.choice(headers_data)
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def get_cityname(url):
    """Fetch the city list page and return every city's name and link.

    Args:
        url: Address of the page that lists the cities.

    Returns:
        A list of dicts, each with the keys ``'city_name'`` and
        ``'city_link'``.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    city_response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    city_response.encoding = city_response.apparent_encoding
    # Parse once and read name and href from the same <a> element, so a link
    # with missing text or href cannot shift the name/link pairing.
    page = etree.HTML(city_response.text)
    city_name_list = []
    for anchor in page.xpath('//*[@id="tool_site"]/div[2]/ul/li/a'):
        city_name = anchor.text
        city_link = anchor.get('href')
        if not city_name or not city_link:
            continue
        # One-character entries are skipped, as in the original code;
        # presumably they are index letters, not cities -- TODO confirm.
        if len(city_name) == 1:
            continue
        city_name_list.append({
            'city_name': str(city_name),
            'city_link': str(city_link),
        })
    # Emitted before returning; in the collapsed original this message
    # appeared after the return statement and so may never have run.
    print('获取城市名称和链接结束...')
    return city_name_list


url = 'http://lishi.tianqi.com/'
cities = get_cityname(url)
# get_time() downloads the same page on every call, so fetch the month list
# once instead of once per city.
months = [time_link.split('/')[-1].split('.')[0] for time_link in get_time()]
for link in cities:
    # Build each city's per-month link and save it to the database.
    city_link = link['city_link']
    for month in months:
        data = {
            'time_url': city_link.replace('index', str(month)),
            'city': link['city_name'],
        }
        print(data)
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document call.
        time_url_table.insert_one(data)
print('导入数据库存完成')
import requests
from lxml import etree


def get_time():
    """Return the history links listed on one sample city's index page.

    Comparing a city's index link with its per-month links shows that the
    only difference is that ``index`` is replaced by the month, so the month
    part of these links can be substituted into any city's index link.

    Returns:
        A list of ``href`` strings taken from the page's link list.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    url = 'http://lishi.tianqi.com/acheng/index.html'
    # A timeout keeps the call from blocking forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    time_lists = etree.HTML(response.text).xpath('//*[@id="tool_site"]/div[2]/ul/li/a/@href')
    return time_lists
2、从数据库中读取上一步保存的链接，逐个爬取每个城市的历史天气数据；
import random

import pymongo
import requests
from lxml import etree

client = pymongo.MongoClient('localhost', 27017)
tianqi_data = client['tianqi_data']
time_url_table = tianqi_data['time_url_table']
tianqi_data_table = tianqi_data['tianqi_data_table']

headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {
    'User-Agent': random.choice(headers_data)
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30


def get_tianqi_data():
    """Crawl every stored month URL and save its daily weather rows.

    Reads each document from ``time_url_table``, downloads the page at its
    ``'time_url'``, and writes one document per row into
    ``tianqi_data_table``. A page that cannot be downloaded is reported and
    skipped so a single failure does not end the whole crawl.
    """
    for link in time_url_table.find():
        url = link['time_url']
        print(url)
        # Choose a User-Agent per request; the pool of agents suggests
        # rotation was intended rather than one pick per process.
        request_headers = {'User-Agent': random.choice(headers_data)}
        try:
            response = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('请求失败,已跳过:', url, exc)
            continue
        # Guess the charset from the body so the Chinese text fields decode
        # correctly even if the server does not declare an encoding.
        response.encoding = response.apparent_encoding
        # Parse the page once and reuse the tree for every column.
        page = etree.HTML(response.text)
        dates = page.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/text()')
        # The [1:-1] slices are kept from the original; presumably they drop
        # a header cell and a trailing cell -- TODO confirm against the page.
        max_temps = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[2]/text()')[1:-1]
        low_temps = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[3]/text()')[1:-1]
        weathers = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[4]/text()')[1:-1]
        fengxiangs = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[5]/text()')[1:-1]
        fenglis = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[6]/text()')[1:-1]
        rows = zip(dates, max_temps, low_temps, weathers, fengxiangs, fenglis)
        for date, max_temp, low_temp, weather, fengxiang, fengli in rows:
            data = {
                # Keep the city with each row; without it, rows from
                # different cities cannot be told apart in the collection.
                '城市': link.get('city'),
                '日期': date,
                '最高温度': max_temp,
                '最低温度': low_temp,
                '天气': weather,
                '风向': fengxiang,
                '风力': fengli,
            }
            # Collection.insert() was removed in PyMongo 4; insert_one() is
            # the supported single-document call.
            tianqi_data_table.insert_one(data)
            print(data)
    print('爬取数据成功')