• Python 爬取论文


    使用 Python 爬取 CVPR 2018 论文信息(标题、链接、摘要、作者、关键词)并写入 MySQL 数据库:

    import os
    import re
    import time
    from urllib.parse import urljoin

    import pymysql
    import requests
    from bs4 import BeautifulSoup
    from jieba.analyse import extract_tags
    from lxml import etree  # HTML parsing / XPath queries

     9 db = pymysql.connect(host="localhost", user="root", passwd="0424wyhhxx", database="test", charset='utf8')
    10 cursor = db.cursor()
    11 
    12 
    # Spider for the CVPR 2018 open-access paper listing.

    class Spider():
        """Scrape CVPR 2018 paper metadata and store it in the ``cvpr1`` table.

        The constructor downloads the listing page for one conference day.
        ``lxml_find`` then visits each paper's detail page, extracts the title,
        a link, the abstract, the authors and one keyword, and inserts one row
        per paper through the module-level ``cursor`` / ``db`` objects.
        """

        # Conference day of the listing; also written to the ``time`` column.
        CONFERENCE_DAY = '2018-06-19'
        # Seconds to wait for a single HTTP response before giving up.
        REQUEST_TIMEOUT = 30
        # Index, among all <a href> values of a detail page, of the link that is
        # stored in the ``link`` column.
        # NOTE(review): presumably the paper's PDF -- confirm against the page
        # layout; selecting by position is fragile.
        LINK_INDEX = 5
        INSERT_SQL = ("insert into cvpr1 (title,link,abstract,author,time,keyword) "
                      "values (%s,%s,%s,%s,%s,%s)")

        def __init__(self):
            """Download the listing page and keep its HTML in ``self.html``.

            Raises:
                requests.RequestException: if the listing page cannot be
                    fetched (connection error, timeout or HTTP error status).
            """
            self.url = 'https://openaccess.thecvf.com/CVPR2018?day=' + self.CONFERENCE_DAY
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 '
                              'Safari/537.36 '
            }
            r = requests.get(self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            self.html = r.text

        @staticmethod
        def _absolute_url(base_url, href):
            """Resolve *href* (relative or absolute) against *base_url*."""
            # urljoin copes with "x/y", "/x/y" and "../../x/y" alike, unlike
            # cutting a fixed number of characters off the href.
            return urljoin(base_url, href.strip())

        @classmethod
        def _paper_link(cls, page_url, hrefs):
            """Return the absolute URL of the ``LINK_INDEX``-th href, or ''.

            An empty string is returned when the page has too few anchors,
            instead of raising ``IndexError``.
            """
            if len(hrefs) <= cls.LINK_INDEX:
                return ''
            return cls._absolute_url(page_url, hrefs[cls.LINK_INDEX])

        @staticmethod
        def _joined_text(fragments, separator=' '):
            """Collapse a list of XPath ``text()`` results into one string.

            Each fragment is stripped, empty fragments are dropped, and the
            rest are joined with *separator*. An empty list yields ''.
            """
            cleaned = (fragment.strip() for fragment in fragments)
            return separator.join(piece for piece in cleaned if piece)

        @staticmethod
        def _top_keyword(abstract):
            """Return the top jieba tag of *abstract*, or '' if there is none."""
            keyword = ''  # stays empty when extract_tags yields nothing
            for keyword, weight in extract_tags(abstract, topK=1, withWeight=True):
                print('%s %s' % (keyword, weight))
            return keyword

        def lxml_find(self, start_index=200):
            '''Parse the listing with lxml and insert one row per paper.

            Args:
                start_index: number of leading listing entries to skip. The
                    default of 200 reproduces the slice that used to be
                    hard-coded; pass 0 to crawl the whole listing.

            Detail pages that cannot be downloaded are reported and skipped.
            Missing title/abstract/author/link nodes are stored as empty
            strings. Each row is committed immediately after it is inserted.
            '''
            number = 1  # 1-based count of rows stored so far
            start = time.time()  # wall-clock timing of the whole crawl
            selector = etree.HTML(self.html)  # listing page as an lxml tree
            paper_hrefs = selector.xpath('//dt[@class="ptitle"]/a/@href')  # list of href strings
            for href in paper_hrefs[start_index:]:
                page_url = self._absolute_url(self.url, href)
                try:
                    req = requests.get(page_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                    req.raise_for_status()
                except requests.RequestException as exc:
                    # One unreachable paper should not abort the whole crawl.
                    print('skipped %s: %s' % (page_url, exc))
                    continue
                req.encoding = req.apparent_encoding
                page = etree.HTML(req.text)

                # XPath returns lists; turn them into plain strings so the SQL
                # driver receives scalar parameters.
                title = self._joined_text(page.xpath('//div[@id="papertitle"]/text()'))
                abstract = self._joined_text(page.xpath('//div[@id="abstract"]/text()'))
                author = self._joined_text(page.xpath('//dd/div[@id="authors"]/b/i/text()'), ', ')
                link = self._paper_link(page_url, page.xpath('//a/@href'))
                keyword = self._top_keyword(abstract)

                row = [title, link, abstract, author, self.CONFERENCE_DAY, keyword]
                cursor.execute(self.INSERT_SQL, row)
                db.commit()

                print("已爬取" + str(number) + "条数据")
                number = number + 1

            end = time.time()
            print('共耗时:', end - start)


    if __name__ == '__main__':
        # Run the crawl, then release the shared cursor and connection even if
        # the crawl raises, so a failure does not leak the MySQL connection.
        try:
            spider = Spider()
            spider.lxml_find()
        finally:
            cursor.close()
            db.close()

  • 相关阅读:
    WebAssembly是什么?
    转《18个实时音视频开发中会用到开源项目》
    Windows Server服务器之介绍及版本信息
    软件开发方法
    SLA(服务等级协议)
    Linux系统之-介绍,主要特性
    Linux系统之-文件系统,桌面环境
    Linux系统之-常用命令及技巧
    网站运维都需要做什么工作
    Oracle Minus关键字
  • 原文地址:https://www.cnblogs.com/znjy/p/14884125.html
Copyright © 2020-2023  润新知