• Downloading Adverse Drug Reaction Database Information


    Requirement: as shown in the figure, I want to fetch the adverse-reaction information for every drug in the adverse drug reaction database.

    After clicking "详细信息" (detail information) and analyzing the page requests, we find that the list data is loaded via an AJAX POST request.
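
    Before writing the full crawler, it is worth replaying that AJAX POST once to confirm the form fields. The sketch below is a minimal check; the field meanings (ec_p as the 1-based page to fetch, ec_pd as the page currently shown, ec_rd/ec_crd as rows per page) are my reading of the captured request, not documented behavior.

    import requests
    from lxml.etree import HTML

    # Replay the captured AJAX POST for page 1 of the result list.
    resp = requests.post(
        'http://pharm.ncmi.cn/dataContent/dataSearch.do?did=6',
        headers={'User-Agent': 'Mozilla/5.0'},
        data={'method': 'list', 'ec_i': 'ec', 'ec_crd': 200,
              'ec_p': 1, 'ec_rd': 200, 'ec_pd': 0},
    )
    doc = HTML(resp.text)
    # The 4th column of each result row links to a drug's detail page.
    print(doc.xpath('//table[@id="ec_table"]//tr/td[4]/a/@href')[:3])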

    •   Step 1: get the URL of each detail page, i.e. the drug ID
    •   Step 2: with the detail-page URL in hand, download the page
    •   Step 3: extract the indications and adverse reactions from each page and write the data to a file

    Code

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2019/1/11
    @Author: Zhang Yafei
    """
    import json
    import os
    
    # monkey.patch_all() must run before the networking imports below so
    # that blocking socket I/O becomes cooperative under gevent.
    from gevent import monkey
    monkey.patch_all()
    
    from concurrent.futures import ThreadPoolExecutor
    from urllib.parse import urljoin
    
    import numpy
    import pandas as pd
    import requests
    from lxml.etree import HTML
    
    
    # Shared lists filled in from the thread-pool callbacks below.
    url_list = []
    drug_list = []
    
    
    def task(page):
        """Fetch one page of the result list (step 1). `page` is 0-based."""
        origin_url = 'http://pharm.ncmi.cn/dataContent/dataSearch.do?did=6'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        data = {
            'method': 'list',
            'ec_i': 'ec',
            'ec_crd': 200,     # apparently rows per page
            'ec_p': page + 1,  # 1-based page to fetch
            'ec_rd': 200,
            'ec_pd': page,     # page currently displayed
        }
        response = requests.post(origin_url, headers=headers, data=data)
        return response
    
    
    def done(future, *args, **kwargs):
        """Callback for task(): collect the detail-page URL of every row."""
        response = future.result()
        doc = HTML(response.text)
        # Column 4 of each row links to the detail page; skip the header row.
        hrefs = doc.xpath('//table[@id="ec_table"]//tr/td[4]/a/@href')[1:]
        for href in hrefs:
            detail_url = urljoin('http://pharm.ncmi.cn', 'dataContent/' + href)
            url_list.append(detail_url)
    
    
    def main():
        """Single-threaded variant: fetch page 1 of the list, then download
        and parse each detail page."""
        origin_url = 'http://pharm.ncmi.cn/dataContent/dataSearch.do?did=6'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        data = {
            'method': 'list',
            'ec_i': 'ec',
            'ec_crd': 200,
            'ec_p': 1,
            'ec_rd': 200,
            'ec_pd': 0,
        }
        response = requests.post(origin_url, headers=headers, data=data)
        doc = HTML(response.text)
        hrefs = doc.xpath('//table[@id="ec_table"]//tr/td[4]/a/@href')[1:]
        url_list = []
        for href in hrefs:
            # e.g. http://pharm.ncmi.cn/dataContent/dataSearch.do?method=viewpage&id=145511&did=6
            detail_url = urljoin('http://pharm.ncmi.cn', 'dataContent/' + href)
            url_list.append(detail_url)
        # parse() reads local files, so download each page before parsing.
        for i, url in enumerate(url_list):
            task1(i, url)
            parse('html/{}.html'.format(i))
    
    
    def parse(file):
        """Extract the drug name, adverse reactions and indications from a
        saved detail page (step 3)."""
        with open(file=file, encoding='utf-8') as f:
            response = f.read()
        response = HTML(text=response)
        # Row positions inside the detail table: row 3 = generic name,
        # row 9 = adverse reactions, second-to-last row = indications.
        drug_name = response.xpath('//form/table[1]//table/tr[3]/td[2]/text()')[0].strip()
        adverse_reaction = response.xpath('//form/table[1]//table/tr[9]/td[2]/text()')[0].strip()
        indiction = response.xpath('//form/table[1]//table/tr[last()-1]/td[2]/text()')[0].strip()
        if not indiction:
            indiction = numpy.nan  # keep empty cells as NaN for pandas
        drug_dict = {
            '药品通用名称': drug_name,
            '不良反应': adverse_reaction,
            '适应症': indiction,
        }
        drug_list.append(drug_dict)
        print(file + ' extracted successfully')
    
    
    def task1(i, url):
        """Download one detail page and save it as html/<i>.html (step 2)."""
        filename = 'html/{}.html'.format(i)
        if os.path.exists(filename):
            return  # skip pages already downloaded on a previous run
        response = requests.get(url)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(response.text)
    
    
    if __name__ == '__main__':
        # 1. Collect all detail-page URLs
        # pool = ThreadPoolExecutor()
        # for page in range(37):
        #     v = pool.submit(task, page)
        #     v.add_done_callback(done)
        #
        # pool.shutdown(wait=True)
    
        # 2. Write the URLs to a file
        # with open('url.py', 'w') as f:
        #     json.dump(url_list, f)
    
        # 3. Read the URLs back and download every page
        # with open('url.py') as f:
        #     url_list = json.load(f)
        # os.makedirs('html', exist_ok=True)
        # pool = ThreadPoolExecutor()
        # for i, url in enumerate(url_list):
        #     v = pool.submit(task1, i, url)
        #
        # pool.shutdown(wait=True)
    
        # 4. Parse the saved pages and write the results to Excel
        for base_path, folders, files in os.walk('html'):
            # html/ has no subdirectories, so this loop runs once.
            file_list = list(map(lambda x: os.path.join(base_path, x), files))
            # list(map(parse, file_list))  # single-threaded alternative
        pool = ThreadPoolExecutor()
        for file in file_list:
            v = pool.submit(parse, file)
    
        pool.shutdown(wait=True)
    
        df = pd.DataFrame(data=drug_list)
        df = df.loc[:, ['药品通用名称', '适应症', '不良反应']]
        with pd.ExcelWriter('adverse_reaction_database.xlsx') as writer:
            df.to_excel(writer, sheet_name='adverse_reaction', index=False)
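
    A note on running this: steps 1-3 in __main__ are commented out because each stage persists its output (url.py for the URL list, the html/ directory for the downloaded pages), so each only needs to run once; uncomment them for the first run, after which step 4 can be re-run alone. Once a run finishes, a quick sanity check on the output file (a sketch; the column names come from the keys used in parse()):

    import pandas as pd

    # Load the generated workbook and confirm the three expected columns.
    df = pd.read_excel('adverse_reaction_database.xlsx', sheet_name='adverse_reaction')
    print(df.shape)
    print(df.columns.tolist())  # ['药品通用名称', '适应症', '不良反应']
    print(df.head())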
    

      
