# lunwenspidder.py
# -*- coding:utf-8 -*-
# Crawler for the CVF Open Access site: walks every conference's workshop
# listing, scrapes each paper page and hands the extracted row to
# ``mysql.insert_item``.
import json
import re

import requests

import mysql  # presumably a project-local DB helper exposing insert_item(rows)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
}

_BASE_URL = 'https://openaccess.thecvf.com'
_REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled server hangs the crawl

# One <dd> entry of the top-level menu:
#   "<title> [<a ...>Main Conference</a>] [<a ...>Workshops</a>]"
# The square brackets are literal page text and therefore must be escaped.
_CONFERENCE_RE = re.compile(
    r'<dd>(.*?) \[<a href="(.*?)">Main Conference</a>\] '
    r'\[<a href="(.*?)/menu.*?">Workshops</a>\]</dd>'
)
# The first <dl> of a workshops menu holds the list of workshops.
_LISTING_RE = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
# A workshop link inside that list: (last path component, link text).
_WORKSHOP_RE = re.compile(
    r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL
)
# Link to an individual paper page on a workshop page.
_PAPER_LINK_RE = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
# Paper page: PDF url, abstract, then the BibTeX fields in page order.
_PAPER_RE = re.compile(
    r'<meta name="citation_pdf_url" content="(.*?)">.*?'
    r'<div id="abstract">(.*?)</div>.*?'
    r'author\s+=\s+\{(.*?)\}.*?'
    r'title\s+=\s+\{(.*?)\}.*?'
    r'booktitle\s+=\s+\{(.*?)\}.*?'
    r'month\s+=\s+\{(.*?)\}.*?'
    r'year\s+=\s+\{(.*?)\}',
    re.DOTALL,
)


def _fetch_text(url):
    """Return the body of *url* as text.

    Sends the module-level ``headers`` and applies a timeout.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
    """
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def _scrape_paper(paper_url):
    """Fetch one paper page and return its row, or ``None`` if unusable.

    The row is ``[author, title, booktitle, "month,year", abstract, pdf_url]``.
    """
    try:
        page = _fetch_text(paper_url).replace('\n', '')
    except requests.RequestException as exc:
        print('skip ' + paper_url + ': ' + str(exc))
        return None

    matches = _PAPER_RE.findall(page)
    print(matches)
    if not matches:
        # Page layout differs from what the pattern expects; skip it rather
        # than abort the whole crawl with an IndexError.
        return None

    pdf_url, abstract, author, title, booktitle, month, year = matches[0]
    return [author, title, booktitle, month + ',' + year, abstract, pdf_url]


def _crawl_workshops(workshops_url):
    """Crawl every workshop listed under *workshops_url* and store its papers."""
    menu_url = workshops_url + '/menu'
    print(menu_url)
    try:
        # Dropping '.py' lets the \w+ link pattern match hrefs that end in .py.
        menu_html = _fetch_text(menu_url).replace('.py', '')
    except requests.RequestException as exc:
        print('skip ' + menu_url + ': ' + str(exc))
        return

    listings = _LISTING_RE.findall(menu_html)
    if not listings:
        return
    workshops = _WORKSHOP_RE.findall(listings[0])
    print(workshops)

    for slug, _label in workshops:
        workshop_url = workshops_url + '/' + slug
        print(workshop_url)
        try:
            workshop_html = _fetch_text(workshop_url)
        except requests.RequestException as exc:
            print('skip ' + workshop_url + ': ' + str(exc))
            continue

        for href in _PAPER_LINK_RE.findall(workshop_html):
            paper_url = _BASE_URL + href  # paper link
            print(paper_url)
            row = _scrape_paper(paper_url)
            if row is not None:
                # Stored one paper at a time, so progress survives a later failure.
                mysql.insert_item([row])


def getData():
    """Crawl all conferences on the CVF Open Access menu and store their
    workshop papers through ``mysql.insert_item``.

    Raises:
        requests.RequestException: if the top-level menu cannot be fetched.
            Failures on individual sub-pages are reported and skipped.
    """
    # The conference pattern is not DOTALL, so newlines have to be removed
    # for an entry that spans several lines to match.
    menu_html = _fetch_text(_BASE_URL + '/menu').replace('\n', '').replace('<br>', '')
    conferences = _CONFERENCE_RE.findall(menu_html)
    print("会议有"+str(len(conferences)))
    print(conferences)

    # Each match is (conference title, main-conference path, workshops path);
    # only the workshops path is crawled.
    for _title, _main_path, workshops_path in conferences:
        _crawl_workshops(_BASE_URL + workshops_path)


if __name__ == '__main__':
    getData()