• 热词搜索六


    lunwenspidder.py

    # -*- coding:utf-8 -*-
    import requests
    import re
    import json
    import mysql
    
    # Default HTTP request headers: a desktop Chrome User-Agent string.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36(KHTML, like Gecko) "
            "Chrome/83.0.4103.106 Safari/537.36"
        ),
    }
    
    
    _BASE_URL = 'https://openaccess.thecvf.com'
    _REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can block forever

    # One conference entry on the top-level menu page:
    #   <dd>TITLE [<a href="MAIN">Main Conference</a>]  [<a href="WS/menu...">Workshops</a>]</dd>
    # The square brackets are literal text and must be escaped.
    _CONFERENCE_RE = re.compile(
        r'<dd>(.*?) \[<a href="(.*?)">Main Conference</a>\]\s*'
        r'\[<a href="(.*?)/menu.*?">Workshops</a>\]</dd>')
    # The <dl> element holding the list of workshops on a workshops menu page.
    _WORKSHOP_LIST_RE = re.compile(r'<dl>(.*?)</dl>', re.DOTALL)
    # One workshop link: captures the last path component and the link text.
    _WORKSHOP_RE = re.compile(
        r'<a href="/?(?:\w+/)?(\w+)">(.*?)</a><br><br>.*?</dd>', re.DOTALL)
    # Link to a single paper page inside a workshop listing.
    _PAPER_LINK_RE = re.compile(r'<dt class="ptitle"><br><a href="(.*?)">')
    # PDF url, abstract and the BibTeX fields of a paper page.
    _PAPER_RE = re.compile(
        r'<meta name="citation_pdf_url" content="(.*?)">'
        r'.*?<div id="abstract">(.*?)</div>'
        r'.*?author\s+=\s+\{(.*?)\}'
        r'.*?title\s+=\s+\{(.*?)\}'
        r'.*?booktitle\s+=\s+\{(.*?)\}'
        r'.*?month\s+=\s+\{(.*?)\}'
        r'.*?year\s+=\s+\{(.*?)\}',
        re.DOTALL)


    def _fetch_text(url):
        """Return the response body of *url* as text.

        Sends the module-level ``headers`` and applies a timeout so a stalled
        connection cannot hang the crawl. Network errors raised by
        ``requests`` propagate to the caller.
        """
        return requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT).text


    def getData():
        """Crawl workshop papers from openaccess.thecvf.com and store them.

        Walks the site menu -> each conference's workshops menu -> each
        workshop page -> each paper page. For every paper page whose metadata
        can be parsed, calls ``mysql.insert_item`` with a list containing one
        row: ``[author, title, booktitle, "month,year", abstract, pdf_url]``.

        Pages that do not match the expected markup are reported with a
        message and skipped instead of aborting the whole crawl.
        Returns None.
        """
        menu_html = _fetch_text(_BASE_URL + '/menu')
        menu_html = menu_html.replace('\n', '').replace('<br>', '')
        conferences = _CONFERENCE_RE.findall(menu_html)
        print("会议有"+str(len(conferences)))
        print(conferences)

        # Each match is (conference title, main-conference path, workshops path);
        # only the workshops path is crawled.
        for _title, _main_path, workshops_path in conferences:
            workshops_url = _BASE_URL + workshops_path
            menu_url = workshops_url + '/menu'
            # '.py' is stripped from the page before matching, as the original
            # code did (presumably for links ending in '.py' -- TODO confirm).
            workshop_menu = _fetch_text(menu_url).replace('.py', '')
            print(menu_url)

            listings = _WORKSHOP_LIST_RE.findall(workshop_menu)
            if not listings:
                print('no <dl> workshop list found, skipping: ' + menu_url)
                continue
            workshops = _WORKSHOP_RE.findall(listings[0])
            print(workshops)

            for workshop_slug, _workshop_name in workshops:
                workshop_url = workshops_url + '/' + workshop_slug
                print(workshop_url)
                paper_paths = _PAPER_LINK_RE.findall(_fetch_text(workshop_url))

                for paper_path in paper_paths:
                    paper_url = _BASE_URL + paper_path  # paper link
                    print(paper_url)
                    paper_html = _fetch_text(paper_url).replace('\n', '')
                    matches = _PAPER_RE.findall(paper_html)
                    print(matches)
                    if not matches:
                        print('paper metadata not found, skipping: ' + paper_url)
                        continue

                    pdf_url, abstract, author, title, booktitle, month, year = matches[0]
                    row = [
                        author,              # author
                        title,               # title
                        booktitle,           # book title
                        month + ',' + year,  # date
                        abstract,            # abstract
                        pdf_url,             # link
                    ]
                    # insert_item receives a list of rows; one paper per call.
                    mysql.insert_item([row])
    
    
    # Run the crawl only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        getData()
  • 相关阅读:
    python接口测试3-JSON格式
    python接口测试2-开发WEB接口
    接口测试1-基础
    Apifox接口测试管理工具
    python的pip安装超时问题解决
    ubuntu解决安装速度问题
    vim进入粘贴模式
    禅道数据库
    内存管理
    文件操作
  • 原文地址:https://www.cnblogs.com/zhaoyuxiao000/p/14915297.html
Copyright © 2020-2023  润新知