"""Scrape fact entries from a magi.com search and locate supporting sentences.

The script performs two independent steps when run directly:

1. Request the magi.com search page and, for every ``article.fact`` element
   found, print the reliability digits, the fact texts and the source links.
2. Download an article page and print every sentence that mentions the terms
   of a (subject, predicate, object) triple, followed by a summary dict.

The text helpers (``split_sentences``, ``parse_reliability``,
``find_spo_sentences``) are pure functions. Third-party modules are imported
inside the functions that touch the network so the helpers can be imported
and used with nothing but the standard library installed.
"""
import re

# Seconds to wait for a server before giving up; without a timeout
# ``requests`` may block forever on an unresponsive host.
REQUEST_TIMEOUT = 10

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
)

MAGI_URL = "https://magi.com/search"
MAGI_QUERY = {"q": "堕却乡"}
# NOTE(review): the cookie and Postman-Token values look like they were
# captured from a single browser/Postman session and are probably stale --
# confirm whether the site actually requires them.
MAGI_HEADERS = {
    'authority': "magi.com",
    'pragma': "no-cache",
    'cache-control': "no-cache,no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': USER_AGENT,
    'accept': (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
    ),
    'referer': "https://magi.com/search?q=%E7%89%B9%E6%96%AF%E6%8B%89",
    'accept-encoding': "gzip, deflate, br",
    'accept-language': "zh-CN,zh;q=0.9",
    'cookie': "acw_tc=7af6142615735221487104171e68298facdedf1e07add2205636582990",
    'Postman-Token': "dda0d475-41b9-44b4-812a-6dd489fe19dd,64d3ddc4-7036-4c42-bff6-53dcbc065db2",
    'Host': "magi.com",
    'Connection': "keep-alive",
}

ARTICLE_URL = "https://www.tuicool.com/articles/jiyEnq7"
ARTICLE_HEADERS = {'User-Agent': USER_AGENT}
SPO = ['特斯拉', '电池供应商', '松下']

# For these predicates only the subject and object have to occur in a
# sentence; the predicate word itself is not searched for.
LOOSE_PREDICATES = frozenset(("描述", "标签", "近义项"))

# Sentence terminators: ideographic full stop plus ASCII and full-width
# '!' / '?'.  Written as a character class with escapes so that '?' is a
# literal (a bare '?' after '|' is a regex syntax error) and so the pattern
# survives Unicode normalisation of this file.
_SENTENCE_END_RE = re.compile(r"([\u3002!?\uff01\uff1f])")
_RELIABILITY_RE = re.compile(r"\d{1,3}")


def split_sentences(text):
    """Split *text* after each sentence terminator, keeping the terminator.

    Text without any terminator is returned unchanged as a single item, and
    a non-empty fragment after the last terminator is kept as its own item
    instead of being dropped.
    """
    pieces = _SENTENCE_END_RE.split(text)
    if len(pieces) == 1:
        return pieces
    # With a capturing group the split result alternates body, terminator,
    # body, ..., and always ends with a (possibly empty) trailing body.
    sentences = [body + mark for body, mark in zip(pieces[0::2], pieces[1::2])]
    tail = pieces[-1]
    if tail:
        sentences.append(tail)
    return sentences


def parse_reliability(text_nodes):
    """Return the last run of 1-3 digits in the first text node, as a string.

    Returns ``None`` when *text_nodes* is empty or its first item contains
    no digits.
    """
    if not text_nodes:
        return None
    digit_runs = _RELIABILITY_RE.findall(text_nodes[0])
    return digit_runs[-1] if digit_runs else None


def find_spo_sentences(text_nodes, spo):
    """Collect the sentences in *text_nodes* that mention the triple *spo*.

    *spo* is a ``(subject, predicate, object)`` sequence of strings.  A
    sentence matches when it contains the subject, the object and -- unless
    the predicate is one of ``LOOSE_PREDICATES`` -- the predicate as well.
    Matches are returned in document order.
    """
    subject, predicate, obj = spo
    if predicate in LOOSE_PREDICATES:
        required = (subject, obj)
    else:
        required = (subject, predicate, obj)

    matches = []
    for node in text_nodes:
        for candidate in split_sentences(node):
            if all(term in candidate for term in required):
                matches.append(candidate)
    return matches


def _parse_html(markup):
    """Parse an HTML string into an lxml element tree."""
    import lxml.etree

    return lxml.etree.HTML(markup)


def iter_magi_facts():
    """Request the magi.com search page and yield one tuple per fact element.

    Each tuple is ``(reliability, texts, links)`` where *reliability* comes
    from ``parse_reliability`` on the element's span texts, *texts* are the
    ``dl/dd`` text nodes and *links* the ``href`` values of the listed
    sources.
    """
    import requests

    response = requests.request(
        "GET",
        MAGI_URL,
        headers=MAGI_HEADERS,
        params=MAGI_QUERY,
        timeout=REQUEST_TIMEOUT,
    )
    tree = _parse_html(response.text)
    for cell in tree.xpath("//main//div[@data-type='fact']//article[@class='fact']"):
        texts = cell.xpath(".//dl/dd//text()")
        links = cell.xpath(".//div/ul//ol//li//a//@href")
        reliability = parse_reliability(cell.xpath(".//div//span//text()"))
        yield reliability, texts, links


def fetch_body_text_nodes(url, headers):
    """Download *url* and return every text node under ``<body>``.

    The response encoding is taken from ``chardet``'s guess on the raw bytes
    rather than from the HTTP headers.
    """
    import chardet
    import requests

    response = requests.request("GET", url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = chardet.detect(response.content)["encoding"]
    return _parse_html(response.text).xpath("//body//text()")


def main():
    """Run both scraping steps and print their results."""
    for reliability, texts, links in iter_magi_facts():
        print(reliability, texts, links)

    spo_sentence = find_spo_sentences(
        fetch_body_text_nodes(ARTICLE_URL, ARTICLE_HEADERS), SPO
    )
    for sentence in spo_sentence:
        print(sentence)
    if spo_sentence:
        item = {
            "spo_sentence": spo_sentence,
            "spo": SPO,
        }
        print(item)


if __name__ == "__main__":
    main()