• A Python scraper for ECVA paper titles, authors, and links
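The script below fetches the ECVA paper index at https://www.ecva.net/papers.php with requests, parses it with BeautifulSoup (lxml parser), and collects each paper's title, detail-page link, authors, download link, and pdf-info link into one result list. Assumed setup, judging from the imports: pip install requests beautifulsoup4 lxml.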


import requests
from bs4 import BeautifulSoup

def get_paper():
    # Example detail pages this index links to:
    # https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/267_ECCV_2020_paper.php
    # https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/283_ECCV_2020_paper.php
    # https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php
    url = 'https://www.ecva.net/papers.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Pass the headers by keyword; requests.get(url, headers) would send them as query params
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    # Dump the page HTML if needed
    # print(page_text)
    soup = BeautifulSoup(page_text, 'lxml')
    all_dt = soup.find_all('dt', class_='ptitle')
    # Per-paper buffer
    temp_res = []
    # Final result set
    res = []
    # Raw link blocks
    link_res = []
    for dt in all_dt:
        # Store the title (dt is already a Tag; no need to re-parse it)
        title = dt.find('a').text
        temp_res.append(title)
        # TODO: store the abstract
        # TODO: store the keywords
        # Store the source link (the href is relative, so prepend the site root)
        sourcelink = 'https://www.ecva.net/' + dt.find('a')['href']
        temp_res.append(sourcelink)
        res.append(temp_res)
        temp_res = []
    # Scrape the authors and the PDF links: under div#content each paper
    # contributes two <dd> elements, the first with the authors, the
    # second with the download / pdf-info link block
    all_dd = soup.find('div', id='content').find_all('dd')
    flag = 0
    author = []
    download = []
    pdfinfo = []
    for item in all_dd:
        if flag % 2 == 0:
            # Even positions: the author list (store the text, not the Tag)
            author.append(item.text.strip())
        else:
            # Odd positions: the link block (two div.link2 per paper)
            link_res.append(item.find_all('div', class_='link2'))
        flag += 1
    print(len(link_res))
    for items in link_res:
        # items[0] is the download link, items[1] the pdf-info link;
        # only the download href is relative and needs the site prefix
        download_div, pdfinfo_div = items[0], items[1]
        download.append('https://www.ecva.net/' + download_div.find('a')['href'])
        pdfinfo.append(pdfinfo_div.find('a')['href'])
    print(len(download))
    print(len(pdfinfo))
    # for item in download:
    #     print(item)
    print("------------------------------")

    # Merge the authors, download and pdfinfo links into res
    for i in range(len(res)):
        # Index with i, not 0: each paper gets its own author and links
        res[i].append(author[i])
        res[i].append(download[i])
        res[i].append(pdfinfo[i])
    # Inspect the final result set
    print(res[0])
    # for item in res:
    #     print(item)
    return res

if __name__ == '__main__':
    get_paper()
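The flag counters and three parallel lists above work, but they misalign silently if any list gets out of step. Below is a minimal alternative sketch that pairs each dt.ptitle directly with the two <dd> elements that follow it, assuming the same page layout under div#content; the function name pair_papers, the shortened User-Agent, and the startswith('http') guard for possibly-absolute pdf-info links are my assumptions, not taken from the original.

import requests
from bs4 import BeautifulSoup

BASE = 'https://www.ecva.net/'

def pair_papers():
    # Fetch the same index page as get_paper() above
    html = requests.get(BASE + 'papers.php',
                        headers={'User-Agent': 'Mozilla/5.0'}).text
    content = BeautifulSoup(html, 'lxml').find('div', id='content')
    titles = content.find_all('dt', class_='ptitle')
    all_dd = content.find_all('dd')
    entries = []
    # all_dd alternates author <dd>, link <dd>; slice the two streams apart
    for dt, author_dd, link_dd in zip(titles, all_dd[0::2], all_dd[1::2]):
        a = dt.find('a')
        hrefs = [div.find('a')['href']
                 for div in link_dd.find_all('div', class_='link2')]
        entries.append({
            'title': a.text.strip(),
            'page': BASE + a['href'],
            'authors': author_dd.text.strip(),
            # prefix relative links only; pdf-info links may already be absolute
            'links': [h if h.startswith('http') else BASE + h for h in hrefs],
        })
    return entries

Slicing all_dd in steps of two and zipping it with the titles keeps each paper's fields together in one dict, so no separate merge loop is needed.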

     
