爬不到此论坛的 HTML 源码,应该是涉及到了反爬技术,以后再来解决。代码如下:
import json

import requests
from lxml import etree


class BtcSpider(object):
    """Collect thread titles and links from the ChainNode forum listing pages.

    Each scraped entry is stored in ``self.data_list`` as a
    ``{"name": <title>, "url": <absolute link>}`` dict, and the whole list is
    written out as JSON by :meth:`save_data`.

    NOTE: the original author reports that the forum's HTML could not be
    retrieved (possibly because of anti-scraping measures), so the XPath
    below may match nothing and the saved list may be empty.
    """

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        # First listing page; later pages append "-<n>" to this address.
        self.base_url = "https://www.chainnode.com/forum/2"
        # Accumulated {"name", "url"} dicts across all parsed pages.
        self.data_list = []

    # Fetch data
    def get_data(self, url: str, timeout: float = 10) -> str:
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to request.
            timeout: Seconds to wait on the server. Without a timeout a
                stalled connection would block the crawl indefinitely.

        Returns:
            The raw response bytes decoded with ``bytes.decode()`` defaults
            (UTF-8).
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content.decode()

    # Parse data
    def parse_data(self, data: str) -> None:
        """Extract title/link pairs from *data* and append them to ``data_list``.

        Title and href are read from the same ``<a>`` element, so an anchor
        with missing text or a missing ``href`` cannot shift the pairing of
        the remaining entries (the failure mode of indexing two separately
        collected lists).

        Args:
            data: HTML source of one listing page.
        """
        # Nothing to parse: skip instead of handing an empty document to lxml.
        if not data or not data.strip():
            return

        # Convert the markup into an element tree.
        tree = etree.HTML(data)
        # NOTE(review): lxml appears to return None for input without any
        # parseable markup -- guard so `.xpath` is never called on None.
        if tree is None:
            return

        # Select the thread-title anchors by their exact class attribute.
        anchors = tree.xpath('//a[@class="link-dark-major font-bold bbt-block"]')
        for anchor in anchors:
            href = anchor.get("href")
            # Direct text nodes only, matching the original `/text()` step.
            title = "".join(anchor.xpath("./text()"))
            if href is None or not title:
                continue
            self.data_list.append({
                "name": title,
                "url": "https://www.chainnode.com" + href,
            })

    # Save data
    def save_data(self) -> None:
        """Write ``data_list`` as JSON to ``03-btc.html`` in the working directory."""
        # Explicit encoding so the result does not depend on the platform
        # default; json escapes non-ASCII by default, so the bytes written
        # are the same as before.
        with open('03-btc.html', 'w', encoding='utf-8') as f:
            json.dump(self.data_list, f)

    # Entry point
    def run(self, pages: int = 4) -> None:
        """Fetch and parse listing pages 1..*pages*, then save the results.

        Args:
            pages: Number of listing pages to crawl (default 4, as before).
        """
        for page in range(1, pages + 1):
            # Build the full URL: page 1 is the bare base URL, page n is
            # "<base_url>-<n>".
            url = self.base_url if page == 1 else f"{self.base_url}-{page}"
            # Send the request.
            data = self.get_data(url)
            # Parse the response.
            self.parse_data(data)
        self.save_data()


if __name__ == "__main__":
    BtcSpider().run()