• NIPS 2020 paper download code
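
The script below pulls the NeurIPS 2020 paper metadata from the virtual-conference site (papers.json), classifies every paper as Oral, Spotlight, or Poster, and writes two CSV files: a sorted paper list (paper_list.csv) and a paper-by-subject-area matrix (NeurIPS Papers.csv).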


# %% Download NeurIPS 2020 paper information
import json
import os
import re

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup


# Work relative to this script's location so output files land next to it.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# %%
# Matches poster links like "poster_<UID>.html" and captures the paper UID.
PAPER_HASH_PATTERN = re.compile(r'poster_(?P<UID>\w+)\.html')
# Captures the session name from "Orals & Spotlights Track <n>: <session>" entries.
SESSION_PATTERN = re.compile(r'Orals & Spotlights Track \d+:\s*(?P<session>[^;]*)')


def cleanup_string(s):
    # Trim and collapse runs of spaces into a single space.
    s = s.strip()
    while '  ' in s:
        s = s.replace('  ', ' ')
    return s


def download_file(download_url, file_name=None):
    if file_name is None:
        file_name = os.path.basename(download_url)
    response = requests.get(download_url, stream=True)
    # Content-Length may be missing; only show a progress bar when the size is known.
    total = response.headers.get('Content-Length')
    pbar = None
    if total is not None:
        pbar = tqdm.tqdm(desc=f'Downloading from {download_url} to {file_name}',
                         total=int(total), unit='B', unit_scale=True, unit_divisor=1000)
    with open(file_name, 'wb') as file:
        for chunk in response.iter_content(chunk_size=10240):
            if chunk:
                file.write(chunk)
            if pbar is not None:
                pbar.update(len(chunk))
    if pbar is not None:
        pbar.close()


# %%
# download paper list
if not os.path.exists('papers.json'):
    download_file('https://neurips.cc/virtual/2020/public/papers.json', file_name='papers.json')

# %%
# get oral paper list
oral_papers = set()
response = requests.get('https://neurips.cc/virtual/2020/public/f_orals.html')
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup.find_all('a', href=PAPER_HASH_PATTERN):
    href = tag['href']
    UID = PAPER_HASH_PATTERN.search(href).group('UID')
    oral_papers.add(UID)

# %%
# process paper list
with open('papers.json', mode='r') as file:
    data = json.load(file)

df = pd.DataFrame(columns=['ID', 'Category', 'Title', 'Authors', 'Keywords', 'Sessions', 'URL', 'Proceedings URL', 'PDF URL', 'UID'])
for paper in tqdm.tqdm(data):
    if paper['eventtype'] != 'Poster':
        continue

    UID = paper['UID']
    # Every accepted paper starts as a Poster; session data upgrades it to
    # Spotlight, and membership in the orals page upgrades it to Oral.
    category = 'Poster'
    sessions = '; '.join(paper['sessions'])
    sessions = '; '.join([match.group('session') for match in SESSION_PATTERN.finditer(sessions)])
    sessions = cleanup_string(sessions)
    if sessions != '':
        category = 'Spotlight'
    if UID in oral_papers:
        category = 'Oral'

    # Deduplicate keywords and store them one per line.
    keywords = set()
    for keyword in ('; '.join(paper['keywords'])).split('; '):
        keyword = cleanup_string(keyword)
        if keyword != '':
            keywords.add(keyword)
    keywords = '\n'.join(sorted(keywords))

    paper = {
        'ID': paper['id'],
        'Category': category,
        'Title': cleanup_string(paper['title']),
        'Authors': cleanup_string(', '.join(paper['authors'])),
        'Keywords': keywords,
        'Sessions': sessions,
        'URL': f'https://neurips.cc/virtual/2020/public/poster_{UID}.html',
        'Proceedings URL': paper['paper_pdf_url'],
        'PDF URL': f'https://proceedings.neurips.cc/paper/2020/file/{UID}-Paper.pdf',
        'UID': UID
    }
    df.loc[len(df)] = paper

# The categorical dtype makes sort_values order rows Oral, Spotlight, Poster.
df['Category'] = pd.Categorical(df['Category'], categories=['Oral', 'Spotlight', 'Poster'])
df.sort_values(by=['Category', 'Sessions', 'Keywords'], inplace=True)
df.to_csv('paper_list.csv', index=False)

# %%
# get paper details
all_subject_areas = set()
for _, paper in tqdm.tqdm(df.iterrows(), total=len(df)):
    if paper['Keywords'] == '':
        continue
    areas = set(paper['Keywords'].split('\n'))
    all_subject_areas.update(areas)

try:
    all_subject_areas.remove('')
except KeyError:
    pass

# Add one column per subject area and mark each paper's areas with 'Y'.
df = df.reindex(columns=df.columns.to_list() + sorted(all_subject_areas))
for i, (_, paper) in enumerate(df.iterrows()):
    for area in paper['Keywords'].split('\n'):
        if area != '':
            # Positional .iloc assignment avoids chained indexing and the
            # label/position mismatch left over from sort_values above.
            df.iloc[i, df.columns.get_loc(area)] = 'Y'

df.to_csv('NeurIPS Papers.csv', index=False)
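
As a quick sanity check, here is a minimal sketch (assuming the script above has already run and written paper_list.csv to the working directory) that reloads the list and tallies papers per category:

import pandas as pd

# Reload the paper list produced above and count papers in each category.
df = pd.read_csv('paper_list.csv')
print(df['Category'].value_counts())

# Spot-check a few Spotlight titles.
print(df.loc[df['Category'] == 'Spotlight', 'Title'].head())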
• Original article: https://www.cnblogs.com/imoon22/p/14255581.html