Python爬虫爬取ECVA论文标题、作者、链接

  1 import re
  2 import requests
  3 from bs4 import BeautifulSoup
  4 import lxml
  5 import traceback
  6 import time
  7 import json
  8 from lxml import etree
  9 def get_paper():
 10     #https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/267_ECCV_2020_paper.php
 11     #https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/283_ECCV_2020_paper.php
 12     #https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php
 13     url='https://www.ecva.net/papers.php'
 14     headers = {
 15         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
 16     }
 17     response=requests.get(url,headers)
 18     response.encoding='utf-8'
 19     page_text=response.text
 20     #输出页面html
 21     # print(page_text)
 22     soup = BeautifulSoup(page_text,'lxml')
 23     all_dt=soup.find_all('dt',class_='ptitle')
 24     #暂存信息
 25     temp_res=[]
 26     #最后结果集
 27     res=[]
 28     #链接
 29     link_res = []
 30     for dt in all_dt:
 31         single_dt=str(dt)
 32         single_soup=BeautifulSoup(single_dt,'lxml')
 33         title=single_soup.find('a').text
 34         #存标题
 35         temp_res.append(title)
 36         #存摘要
 37 
 38         #存关键字
 39 
 40         #存源链接
 41         sourcelink=single_soup.find('a')['href']
 42         sourcelink="https://www.ecva.net/"+sourcelink
 43         temp_res.append(sourcelink)
 44         res.append(temp_res)
 45         temp_res=[]
 46     #爬取作者和pdf文件链接
 47     all_dd=soup.find('div',id='content')
 48     all_dd=all_dd.find_all('dd')
 49     flag=0
 50     author=[]
 51     download=[]
 52     pdfinfo=[]
 53     for item in all_dd:
 54         if(flag%2==0):
 55             #保存作者
 56             author.append(item)
 57         else:
 58             linktext=str(item)
 59             linksoup=BeautifulSoup(linktext,'lxml')
 60             link_res.append(linksoup.find_all('div',class_='link2'))
 61             #解析download 和 pdfinfo
 62         flag = flag + 1
 63     """
 64     继续使用beautifulsoup
 65     download_text 和 pdfinfo_text
 66     存储author
 67     "https://www.ecva.net/"
 68     """
 69     linkflag=1
 70     print(len(link_res))
 71     for items in link_res:
 72         for item in items:
 73             if(linkflag%2==0):
 74                 pdfinfo_text = str(item)
 75             else:
 76                 download_text = str(item)
 77             linkflag=linkflag+1
 78         download_text_soup=BeautifulSoup(download_text,'lxml')
 79         pdfinfo_text_soup=BeautifulSoup(pdfinfo_text,'lxml')
 80         #解析两个链接
 81         download.append("https://www.ecva.net/"+download_text_soup.find('a')['href'])
 82         pdfinfo.append(pdfinfo_text_soup.find('a')['href'])
 83     print(len(download))
 84     print(len(pdfinfo))
 85     # for item in download :
 86     #     print(item)
 87     print("------------------------------")
 88 
 89     #把作者和download pdfinfo 存到res
 90     for i in range(0,len(res)):
 91         #添加作者
 92         res[i].append(author[0])
 93         #添加download
 94         res[i].append(download[0])
 95         #添加pdfinfo
 96         res[i].append(pdfinfo[0])
 97     #遍历最终结果集
 98     print(res[0])
 99     # for item in res:
100     #     print(item)
101     return
102 
# Run the scraper only when this file is executed directly, not on import.
if __name__ == '__main__':
    get_paper()

相关阅读:
MySql的常用命令
 yum命令配置及使用说明和常见问题处理
 oracle12c创建用户和表空间出现的问题
 oracle云部署
 ORA-12154: TNS:could not resolve the connect identifier specified
Linux之iptables
Linux之MySQL
Linux之apache
oracle查锁表
 cookie 和 HttpSession
原文地址：https://www.cnblogs.com/rainbow-1/p/14876076.html