爬取这个网站:https://yq.aliyun.com/articles/
# -*- coding: utf-8 -*-
"""Crawl Aliyun Yunqi community articles that match a search keyword.

The script queries the article search endpoint for ``key``, reads the total
hit count from the first result page, walks every result page, and saves each
linked article as ``<title>.html`` inside ``./aliyun/``.
"""
import os
import re
import time
from urllib.parse import urljoin

import requests
from parsel import Selector

key = "Python"
url = "https://yq.aliyun.com/search/articles/"
hds = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.88 Safari/537.36"
    )
}

SITE_ROOT = "https://yq.aliyun.com"
OUT_DIR = "./aliyun/"
# Page size used by the pagination arithmetic below.
PAGE_SIZE = 15
# Seconds to wait for a response; without a timeout requests can block forever.
TIMEOUT = 10

# Banner on the search page that carries the total number of hits.
COUNT_RE = re.compile('<div class="_search-info">找到(.*?)条关于', re.S)
# NOTE: the group is greedy and re.S lets "." cross newlines, so the capture
# runs up to the LAST "</div>" of the page, not the one closing this element.
CONTENT_RE = re.compile(
    '<div class="content-detail unsafe markdown-body">(.*)</div>', re.S
)
# Characters that are path separators or are rejected in common file systems.
UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _fetch(target, params=None):
    """Return the decoded body of a GET request to *target*.

    Every request sends the browser-like headers in ``hds`` and decodes the
    body with the encoding detected from the content, so result pages and
    article pages are handled the same way as the first search request.
    """
    resp = requests.get(target, headers=hds, params=params, timeout=TIMEOUT)
    resp.encoding = resp.apparent_encoding
    return resp.text


def _safe_filename(title):
    """Return *title* with characters unsuitable for a file name replaced."""
    return UNSAFE_NAME_RE.sub("_", title).strip()


first_page = _fetch(url, params={"q": key})
count_match = COUNT_RE.search(first_page)
if count_match is None:
    raise SystemExit("Could not find the result count on the search page.")

total_hits = int(count_match.group(1))
# Ceiling division: an exact multiple of PAGE_SIZE must not add an empty page.
allpage = -(-total_hits // PAGE_SIZE)

os.makedirs(OUT_DIR, exist_ok=True)

for page_no in range(1, allpage + 1):
    print("----正在爬第" + str(page_no) + "页------")
    listing = _fetch(url, params={"q": key, "p": str(page_no)})
    hrefs = Selector(listing).xpath(
        "//div[@class='media-body text-overflow']/a/@href"
    ).getall()

    for href in hrefs:
        # urljoin keeps absolute links intact and resolves relative ones.
        thisurl = urljoin(SITE_ROOT, href)
        thisdata = _fetch(thisurl)

        title = Selector(thisdata).xpath(
            "//p[@class='hiddenTitle']/text()"
        ).get()
        print(title)

        content_match = CONTENT_RE.search(thisdata)
        if not title or content_match is None:
            # Page layout differs from what the selectors expect; skip it
            # instead of aborting the whole crawl.
            print("Skipping " + thisurl + ": title or content not found")
            continue

        content = content_match.group(1)
        print(content)

        out_path = os.path.join(OUT_DIR, _safe_filename(title) + ".html")
        with open(out_path, "w", encoding="utf8") as f:
            f.write(title + "<br /><br />" + content)