论文爬取（一）

论文爬取爬虫

# -*- coding:utf-8 -*-
import requests
import re
import json
import mysql

# Header set carrying a desktop-Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36(KHTML, like Gecko) "
        "Chrome/83.0.4103.106 Safari/537.36"
    ),
}


# Root of the CVF open-access site; scraped hrefs are resolved against it.
_CVF_ROOT = 'https://openaccess.thecvf.com'

# Seconds to wait on a single HTTP request before giving up on that page.
_REQUEST_TIMEOUT = 30

# One <dd> entry of the top-level menu:
# (conference title, main-conference href, workshops href without "/menu").
_CONFERENCE_RE = re.compile(
    r'<dd>(.*?) \[<a href="(.*?)">Main Conference</a>\]'
    r'  \[<a href="(.*?)/menu.*?">Workshops</a>\]</dd>')

# The first <dl> of a workshops menu holds the list of workshops.
_WORKSHOP_LIST_RE = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)

# One workshop entry inside that list: (page slug, workshop name).
_WORKSHOP_LINK_RE = re.compile(
    r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL)

# Link to a single paper page on a workshop page.
_PAPER_LINK_RE = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')

# Paper page: (pdf url, abstract, author, title, booktitle, month, year).
# The last five come from the BibTeX snippet embedded in the page.
_PAPER_META_RE = re.compile(
    r'<meta name="citation_pdf_url" content="(.*?)">.*?'
    r'<div id="abstract">(.*?)</div>.*?'
    r'author\s+=\s+\{(.*?)\}.*?'
    r'title\s+=\s+\{(.*?)\}.*?'
    r'booktitle\s+=\s+\{(.*?)\}.*?'
    r'month\s+=\s+\{(.*?)\}.*?'
    r'year\s+=\s+\{(.*?)\}',
    re.DOTALL)


def _absolute_url(href):
    """Return *href* as an absolute URL rooted at the CVF site."""
    if href.startswith(('http://', 'https://')):
        return href
    path = href.lstrip('/')
    # Join with exactly one slash whether or not the href carried its own.
    return _CVF_ROOT + '/' + path if path else _CVF_ROOT


def _fetch_text(url):
    """Return the response body of *url* as text, or None if the request fails."""
    try:
        return requests.get(url, headers=headers,
                            timeout=_REQUEST_TIMEOUT).text
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl.
        print('request failed: ' + url + ' (' + str(exc) + ')')
        return None


def _store_paper(paper_url):
    """Fetch one paper page, extract its metadata and hand it to mysql.insert_item."""
    print(paper_url)
    page = _fetch_text(paper_url)
    if page is None:
        return
    # Newlines are stripped so the multi-line BibTeX block sits on one line.
    keyC = _PAPER_META_RE.findall(page.replace('\n', ''))
    print(keyC)
    if not keyC:
        # Page layout did not match; skip instead of raising IndexError.
        return
    pdf_url, abstract, authors, title, booktitle, month, year = keyC[0]
    # Row layout: author, title, book title, "month,year", abstract, pdf link.
    mysql.insert_item(
        [[authors, title, booktitle, month + ',' + year, abstract, pdf_url]])


def getData():
    """Crawl workshop papers from openaccess.thecvf.com and store their metadata.

    Walks the site in three levels: the top-level menu lists conferences,
    each conference's workshops menu lists workshop pages, and each workshop
    page lists paper pages. For every paper page whose metadata can be
    parsed, ``mysql.insert_item`` is called with a one-element list holding
    ``[author, title, booktitle, "month,year", abstract, pdf_url]``.

    Pages that cannot be fetched or do not match the expected layout are
    skipped. Progress is reported with ``print``. Returns None.

    NOTE(review): ``mysql`` is whatever module the file imports under that
    name; what ``insert_item`` does with the row is not visible here.
    """
    menu_html = _fetch_text(_CVF_ROOT + '/menu')
    if menu_html is None:
        return
    menu_html = menu_html.replace('\n', '').replace('<br>', '')

    keyA = _CONFERENCE_RE.findall(menu_html)
    print("会议有"+str(len(keyA)))
    print(keyA)

    # Only the workshops link of each conference is followed.
    for _title, _main_href, workshops_href in keyA:
        workshops_url = _absolute_url(workshops_href)
        menu_url = workshops_url + '/menu'
        workshops_html = _fetch_text(menu_url)
        if workshops_html is None:
            continue
        # Drop ".py" suffixes so the hrefs match the \w+ slug pattern.
        workshops_html = workshops_html.replace('.py', '')
        print(menu_url)

        listing = _WORKSHOP_LIST_RE.search(workshops_html)
        if listing is None:
            # No workshop list on this menu page; nothing to crawl.
            continue
        keyB = _WORKSHOP_LINK_RE.findall(listing.group(1))
        print(keyB)

        for slug, _workshop_name in keyB:
            workshop_url = workshops_url + '/' + slug
            print(workshop_url)
            workshop_html = _fetch_text(workshop_url)
            if workshop_html is None:
                continue
            for paper_href in _PAPER_LINK_RE.findall(workshop_html):
                _store_paper(_absolute_url(paper_href))


# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    getData()

相关阅读:
博客搬到blog.csgrandeur.com
CSGrandeur的WebGL学习——WebGL教程
 hihoCoder 1160 攻城略地
 HDU 5212 Code
Ubuntu 14.04 MySQL同步
 Ubuntu 用vsftpd 配置FTP服务器
 Ubuntu14.04 Server amd64 配置 Apache+MySQL+Django
LeetCode OJ 题解
 MFC+Android模拟器实现自动玩“天天爱消除”
湖南2013第九届省赛解题报告（长期拖延更新中。。。）
原文地址：https://www.cnblogs.com/mumulailai/p/14912235.html