• 个人冲刺第四天6.10


    # -*- coding:utf-8 -*-
    import requests
    import re
    import json
    import Mysql

    # Request headers carrying a desktop Chrome User-Agent string.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36(KHTML, like Gecko) "
            "Chrome/83.0.4103.106 Safari/537.36"
        ),
    }


    def getData():
        """Crawl workshop papers from openaccess.thecvf.com and store them.

        Walks the site menu -> each conference's workshop menu -> each
        workshop's paper list -> each paper page, extracts the BibTeX fields,
        abstract and PDF link with regular expressions, and passes every paper
        to ``Mysql.insert_item`` as a one-row list
        ``[[author, title, booktitle, "month,year", abstract, pdf_url]]``.

        Pages that cannot be fetched or do not match the expected markup are
        skipped instead of aborting the whole crawl.

        Returns:
            None.
        """
        from urllib.parse import urljoin

        menu_url = "https://openaccess.thecvf.com/menu"

        # All patterns are compiled once here rather than inside the loops.
        # Brackets are escaped so they match literally (unescaped they form
        # character classes and leave the pattern with one capture group).
        conference_re = re.compile(
            r'<dd>(.*?)\s*\[<a href="(.*?)">Main Conference</a>\]'
            r'\s*\[<a href="(.*?)/menu.*?">Workshops</a>\]\s*</dd>'
        )
        listing_re = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
        workshop_re = re.compile(
            r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL
        )
        paper_link_re = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
        paper_re = re.compile(
            r'<meta name="citation_pdf_url" content="(.*?)">'
            r'.*?<div id="abstract"\s*>(.*?)</div>'
            r'.*?author\s+=\s+\{(.*?)\}'
            r'.*?title\s+=\s+\{(.*?)\}'
            r'.*?booktitle\s+=\s+\{(.*?)\}'
            r'.*?month\s+=\s+\{(.*?)\}'
            r'.*?year\s+=\s+\{(.*?)\}',
            re.DOTALL,
        )

        menu_page = _fetch_text(menu_url)
        if menu_page is None:
            return
        # NOTE(review): the previous code removed every ' ' here, which made the
        # pattern (it contains literal spaces such as '<a href') unmatchable.
        # Presumably the intent was to drop '&nbsp;' entities - confirm against
        # the live page.
        menu_page = menu_page.replace('&nbsp;', '').replace('<br>', '')
        conferences = conference_re.findall(menu_page)
        print("会议有"+str(len(conferences)))
        print(conferences)

        for _title, _main_href, workshops_href in conferences:
            # urljoin handles hrefs both with and without a leading slash.
            workshops_url = urljoin(menu_url, workshops_href)
            workshops_menu_url = workshops_url + '/menu'
            workshops_page = _fetch_text(workshops_menu_url)
            print(workshops_menu_url)
            if workshops_page is None:
                continue
            # Dropping '.py' lets the (\w+) group capture the bare page name.
            listings = listing_re.findall(workshops_page.replace('.py', ''))
            if not listings:
                continue  # no <dl> block on this page
            workshops = workshop_re.findall(listings[0])
            print(workshops)

            for slug, _workshop_name in workshops:
                workshop_url = workshops_url + '/' + slug
                print(workshop_url)
                workshop_page = _fetch_text(workshop_url)
                if workshop_page is None:
                    continue

                for paper_href in paper_link_re.findall(workshop_page):
                    paper_url = urljoin(workshop_url, paper_href)  # paper link
                    print(paper_url)
                    paper_page = _fetch_text(paper_url)
                    if paper_page is None:
                        continue
                    # NOTE(review): same '&nbsp;' assumption as for the menu.
                    fields = paper_re.findall(paper_page.replace('&nbsp;', ''))
                    print(fields)
                    if not fields:
                        continue  # markup differs from what the pattern expects
                    pdf_url, abstract, author, title, booktitle, month, year = fields[0]
                    row = [
                        author,               # author
                        title,                # title
                        booktitle,            # book title
                        month + ',' + year,   # date
                        abstract,             # abstract
                        pdf_url,              # link
                    ]
                    # One row per call, same shape the original passed.
                    Mysql.insert_item([row])


    def _fetch_text(url):
        """Return the body of *url* as text, or None when the request fails."""
        try:
            # Send the module's User-Agent and bound the wait so one stalled
            # connection cannot hang the crawl.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("request failed: " + url + " (" + str(exc) + ")")
            return None
        return response.text


    # Start the crawl only when this file is executed as a script.
    if __name__ == "__main__":
        getData()
  • 相关阅读:
    Thread Based Parallelism
    Thread Based Parallelism
    The Divide and Conquer Approach
    Algorithms
    FTP
    POP and IMAP
    通过 python 处理 email
    Android开发环境搭建简介
    Hello world
    mybatis3.2初学感悟
  • 原文地址:https://www.cnblogs.com/wanghaoning/p/14914544.html
Copyright © 2020-2023  润新知