• 论文爬取(一)


    论文爬取爬虫

    # -*- coding:utf-8 -*-
    import requests
    import re
    import json
    import mysql

    # HTTP request headers carrying a desktop Chrome User-Agent string.
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"
    }


    _CVF_BASE = 'https://openaccess.thecvf.com'
    _REQUEST_TIMEOUT = 30  # seconds; keeps one stalled connection from hanging the crawl

    # NOTE: the published listing had lost every backslash, so "\[" read as the
    # start of a character class and "\w+" / "\s+" as the literal letters "w" / "s".
    # The escapes are restored here. Patterns are compiled once, not per page.

    # One conference entry on the top-level menu: (title, main href, workshops href).
    _CONFERENCE_RE = re.compile(
        r'<dd>(.*?) \[<a href="(.*?)">Main Conference</a>\] '
        r'\[<a href="(.*?)/menu.*?">Workshops</a>\]</dd>')
    # The <dl> listing on a workshops menu page.
    _WORKSHOP_LIST_RE = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
    # One workshop inside that listing: (last path segment of the href, link text).
    _WORKSHOP_LINK_RE = re.compile(
        r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL)
    # The href of each paper title on a workshop page.
    _PAPER_LINK_RE = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
    # Paper page fields: (pdf url, abstract, author, title, booktitle, month, year).
    _PAPER_RE = re.compile(
        r'<meta name="citation_pdf_url" content="(.*?)">'
        r'.*?<div id="abstract">(.*?)</div>'
        r'.*?author\s+=\s+\{(.*?)\}'
        r'.*?title\s+=\s+\{(.*?)\}'
        r'.*?booktitle\s+=\s+\{(.*?)\}'
        r'.*?month\s+=\s+\{(.*?)\}'
        r'.*?year\s+=\s+\{(.*?)\}',
        re.DOTALL)


    def _fetch_text(url):
        """Return the response body of *url* as text, sending the module-level headers."""
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        return response.text


    def getData():
        """Crawl workshop papers from openaccess.thecvf.com and store them.

        Walks the top-level menu, then each conference's workshops menu, then
        each workshop page, then each paper page. For every paper page that
        matches the expected layout, one record
        ``[author, title, booktitle, "month,year", abstract, pdf_url]`` is passed
        to ``mysql.insert_item`` wrapped in a single-element list.

        Pages that do not match the expected layout are skipped. Network
        errors raised by ``requests`` are not caught and propagate to the caller.
        """
        # The patterns contain literal spaces, so only newlines (and <br>) are
        # stripped; the " " in the published listing was a mangled "\n".
        menu_html = _fetch_text(_CVF_BASE + '/menu').replace('\n', '').replace('<br>', '')
        conferences = _CONFERENCE_RE.findall(menu_html)
        print("会议有"+str(len(conferences)))
        print(conferences)

        for _title, _main_href, workshops_href in conferences:
            workshops_url = _CVF_BASE + workshops_href
            menu_url = workshops_url + '/menu'
            # Workshop links end in ".py"; dropping it leaves the bare page name.
            workshops_html = _fetch_text(menu_url).replace('.py', '')
            print(menu_url)

            listings = _WORKSHOP_LIST_RE.findall(workshops_html)
            if not listings:
                # No <dl> listing on this page: nothing to crawl for this conference.
                continue
            workshops = _WORKSHOP_LINK_RE.findall(listings[0])
            print(workshops)

            for page_name, _link_text in workshops:
                workshop_url = workshops_url + '/' + page_name
                print(workshop_url)
                workshop_html = _fetch_text(workshop_url)

                for paper_href in _PAPER_LINK_RE.findall(workshop_html):
                    paper_url = _CVF_BASE + paper_href
                    print(paper_url)
                    paper_html = _fetch_text(paper_url).replace('\n', '')
                    fields = _PAPER_RE.findall(paper_html)
                    print(fields)
                    if not fields:
                        # Unexpected page layout: skip this paper, keep crawling.
                        continue

                    pdf_url, abstract, author, title, booktitle, month, year = fields[0]
                    record = [author, title, booktitle, month + ',' + year, abstract, pdf_url]
                    # insert_item receives a list holding exactly one record per call.
                    mysql.insert_item([record])


    # Script entry point: start the crawl only when this file is executed directly.
    if __name__ == '__main__':
        getData()
  • 相关阅读:
    博客搬到blog.csgrandeur.com
    CSGrandeur的WebGL学习——WebGL教程
    hihoCoder 1160 攻城略地
    HDU 5212 Code
    Ubuntu 14.04 MySQL同步
    Ubuntu 用vsftpd 配置FTP服务器
    Ubuntu14.04 Server amd64 配置 Apache+MySQL+Django
    LeetCode OJ 题解
    MFC+Android模拟器 实现 自动玩“天天爱消除”
    湖南2013第九届省赛解题报告(长期拖延更新中。。。)
  • 原文地址:https://www.cnblogs.com/mumulailai/p/14912235.html
Copyright © 2020-2023  润新知