• 爬虫案例之PubMed数据库下载


    代码

    # encoding=utf-8
    import os, time, re
    import urllib.request
    import urllib.parse
    import ssl
    
    # NOTE(review): this installs an unverified HTTPS context as the default for
    # urllib, so TLS certificates are not checked for any request made below.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Return codes shared by Download and Download_auto.
    SUCCESS = 1
    FAILURE = 0

    # Paging: 1-based index of the first record, and records fetched per request.
    startNum = 1
    retmax = 500

    # Output root directory and the drug names to query; main() creates one
    # sub-directory per drug under BASE.
    BASE = 'NARA'
    FILES = [
        'Losartan',
        'Valsartan',
        'Irbesartan',
        'Eprosartan',
        'Candesartan',
        'Telmisartan',
        'Olmesartan',
    ]

    # Alternative configuration, kept for reference:
    # BASE = 'Triptans'
    # FILES = ['Sumatriptan','Zolmitriptan',
    #          'Naratriptan','Rizatriptan','Almotriptan',
    #          'Frovatriptan','Eletriptan']

    # Create the output root before anything tries to write into it.
    if not os.path.exists(BASE):
        os.mkdir(BASE)
    
    
    def lastline(fd):
        """Return the number stored on the last non-blank line of a checkpoint file.

        The file is created empty when it does not exist yet.  Blank lines are
        skipped, so a stray trailing newline does not make the parse fail.

        Args:
            fd: Path of the checkpoint file (a path string, not a descriptor).

        Returns:
            The integer on the last non-blank line, or 0 when the file holds
            no numbers.

        Raises:
            ValueError: If the last non-blank line is not an integer.
        """
        print(fd)
        if not os.path.isfile(fd):
            # Touch the file so later append-mode opens find it in place.
            with open(fd, 'w'):
                pass
        with open(fd, 'r') as f:
            entries = [line.strip() for line in f if line.strip()]
        if not entries:
            return 0
        return int(entries[-1])
    
    
    def Download(drug, sleep_time, query_key, webenv, endNum):
        """Fetch a stored PubMed search result in batches of ``retmax`` records.

        Every batch is saved as ``BASE/<drug>/<first>-<last>.xml``.  After each
        successful batch its ``retstart`` offset is appended to
        ``BASE/<drug>/checkpoint.txt`` so a later call resumes after the last
        completed batch instead of starting over.

        Args:
            drug: Name of the sub-directory under ``BASE`` to write into.
            sleep_time: Seconds to sleep before each request.
            query_key: Value sent as the ``query_key`` efetch parameter.
            webenv: Value sent as the ``WebEnv`` efetch parameter.
            endNum: Exclusive upper bound for ``retstart``.

        Returns:
            ``SUCCESS`` once the loop over all batches has finished.

        Raises:
            Exception: Any error raised while sleeping, downloading or writing
                the checkpoint is printed, appended to
                ``BASE/<drug>/error.txt`` and re-raised unchanged.
        """
        checkpoint_path = '{}/{}/checkpoint.txt'.format(BASE, drug)
        lastNum = lastline(checkpoint_path)
        # NOTE: a stored checkpoint of 0 cannot be told apart from an empty
        # checkpoint file, so in that case the first batch is fetched again.
        if lastNum == 0:
            start = startNum - 1
        else:
            start = lastNum + retmax

        with open(checkpoint_path, 'a') as f_append, \
                open('{}/{}/error.txt'.format(BASE, drug), 'a') as f_error:
            try:
                for retstart in range(start, endNum, retmax):
                    time.sleep(sleep_time)
                    print('\tdownloading: %d - %d' % (retstart + 1, retstart + retmax))
                    urllib.request.urlretrieve(
                        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
                        'db=pubmed&query_key=%s&WebEnv=%s&retstart=%s&retmax=%s&retmode=xml' % (
                            query_key, webenv, retstart, retmax),
                        '%s/%s/%d-%d.xml' % (BASE, drug, retstart + 1, retstart + retmax))

                    # Record the batch only after the file was fully retrieved,
                    # and flush so the checkpoint survives an abrupt stop.
                    f_append.write('%d\n' % retstart)
                    f_append.flush()
            except Exception as ex:
                print(ex)
                f_error.write('%r\n' % (ex,))
                # Re-raise the original error so the caller can see its cause.
                raise

        print('Downloading  is done........................')
        return SUCCESS
    
    
    def Download_auto(fun, drug, query_key, webenv, endNum, sleep_time=5):
        """Call *fun* repeatedly until it reports ``SUCCESS``.

        *fun* is invoked as ``fun(drug, sleep_time, query_key, webenv, endNum)``.
        Each time the call raises, ``sleep_time`` is increased by 5 and the call
        is attempted again; there is no upper limit on the number of attempts.
        """
        finished = False
        while not finished:
            try:
                outcome = fun(drug, sleep_time, query_key, webenv, endNum)
                finished = outcome == SUCCESS
            except Exception:
                # Back off a little more before the next attempt.
                sleep_time += 5
                print('prolong sleep time:', sleep_time)
    
    
    def main(drug):
        """Search PubMed for *drug* and download every matching record.

        Creates ``BASE/<drug>`` if needed, runs an esearch with
        ``usehistory=y`` for ``<drug>[TIAB] OR <drug>[MH]``, extracts the hit
        count, query key and WebEnv from the response, and hands them to
        ``Download_auto``.

        Args:
            drug: Drug name used both as the search term and as the output
                sub-directory name.

        Raises:
            RuntimeError: If the esearch response does not contain the
                expected ``Count`` / ``QueryKey`` / ``WebEnv`` elements.
        """
        os.makedirs('{}/{}'.format(BASE, drug), exist_ok=True)

        # Percent-encode the name so terms containing spaces or other reserved
        # characters still form a valid URL.
        term = urllib.parse.quote(drug)
        query = '%s[TIAB]+OR+%s[MH]' % (term, term)
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={}&usehistory=y'.format(query)
        with urllib.request.urlopen(url) as history:
            content = history.read().decode()

        pattern = re.compile(r'<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>')
        s = pattern.search(content)
        if s is None:
            raise RuntimeError('unexpected esearch response for %s' % drug)
        count, query_key, webenv = s.groups()

        print('total counts: %s' % count)
        endNum = int(count)
        print(endNum)
        Download_auto(Download, drug, query_key, webenv, endNum)
    
    
    if __name__ == '__main__':
        # Process every configured drug in order and report the elapsed seconds.
        start = time.time()
        for drug_name in FILES:
            main(drug_name)
        print(time.time() - start)
    

      

  • 相关阅读:
    推荐算法学习资料
    imsdroid 学习(初认识)
    从网易新闻看离线阅读的实现思路
    关于PullToRefreshView bug 的修复
    Android Log日志的封装类,显示类名以及行号,快速定位
    Android Sqlite数据库版本升级管理初探
    《围观啦》发布了!!!!!!!
    单本书阅读,android客户端
    Android P2P语音通话实现(思路探讨)
    HTTP协议基础
  • 原文地址:https://www.cnblogs.com/zhangyafei/p/10266502.html
Copyright © 2020-2023  润新知