https://www.aqistudy.cn/historydata/
分析思路:
- 先判断是不是动态加载的数据
- 找城市标签的定位,先熟悉源码
# Fetch the historical-data landing page and print every city name on it.
# The imports are local to this snippet so that it runs on its own.
import requests
from lxml import etree

url = "https://www.aqistudy.cn/historydata/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

# Without an explicit timeout, requests waits indefinitely on a stalled server.
url_text = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(url_text)

# Hot cities: //div[@class="hot"]/ul/li/a/text()
# All cities: //div[@class="bottom"]/ul/div[2]/li/a/text()
# The "|" union collects both groups with a single XPath query.
city_name_list = tree.xpath('//div[@class="hot"]/ul/li/a/text()|//div[@class="bottom"]/ul/div[2]/li/a/text()')
print(city_name_list)
# List every city in the "bottom" section together with the URL of its
# history page.
from urllib.parse import urljoin

import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# Without an explicit timeout, requests waits indefinitely on a stalled server.
response = requests.get(url=url, headers=headers, timeout=10)
# Print the encoding that requests reports for the response.
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)

# "ul//li" already matches direct <li> children as well as nested ones, so the
# former union with "ul/li" selected exactly the same nodes.
li_list = tree.xpath('//div[@class="bottom"]/ul//li')
for li in li_list:
    names = li.xpath('./a/text()')
    hrefs = li.xpath('./a/@href')
    # Skip entries that carry no link text or no href instead of raising
    # IndexError on the [0] lookup.
    if not names or not hrefs:
        continue
    city_name = names[0]
    # urljoin resolves relative hrefs against the page URL and leaves
    # absolute hrefs intact.
    city_url = urljoin(url, hrefs[0])
    print(city_name,city_url)