• 寻医问药 爬虫


    import requests
    import re
    import pandas as pd
    
    def get_all_date_url():
        """Collect the distinct dated listing URLs linked from the index pages.

        Fetches index pages 1 through 61 under ``http://club.xywy.com/keshi/``
        and extracts every link of the form
        ``http://club.xywy.com/keshi/<YYYY>-<MM>-<digits>/<digits>.html``.

        Returns:
            list[str]: the de-duplicated URLs, in no particular order.

        Raises:
            requests.RequestException: propagated if an index page cannot be
                fetched.
        """
        # ``\d`` (a digit) is required here; a bare ``d`` only matches the
        # literal letter and would never match a real URL. Dots are escaped
        # so they match a literal '.' only.
        date_url_pattern = re.compile(
            r"http://club\.xywy\.com/keshi/\d{4}-\d{2}-\d+/\d+\.html"
        )
        found = set()
        for page in range(1, 62):
            url = 'http://club.xywy.com/keshi/{}.html'.format(page)
            res = requests.get(url)
            found.update(date_url_pattern.findall(res.text))
        return list(found)
    
    def get_QA_url(url):
        """Expand one listing URL into the URLs of all of its pages.

        Downloads ``url``, decodes it as gb2312, reads the total page count
        from the ``共 N 页`` pager text, and builds one URL per page using the
        second-to-last path segment of ``url``.

        Args:
            url: a listing URL whose second-to-last ``/``-separated segment
                identifies the listing (presumably the date part, e.g.
                ``2018-01-18`` -- confirm against the site).

        Returns:
            list[str]: page URLs numbered 1..N in ascending order, or an
            empty list when the page shows no page count.

        Raises:
            requests.RequestException: propagated if the page cannot be
                fetched.
        """
        res = requests.get(url)
        res.encoding = 'gb2312'
        # ``\d+`` captures the numeric page count; a bare ``d+`` would only
        # match the letter 'd' and leave the result list empty.
        page_counts = re.findall(r'共 (\d+) 页', res.text)
        if not page_counts:
            # No pager text found: nothing to expand, rather than IndexError.
            return []
        listing_segment = url.split('/')[-2]
        return [
            'http://club.xywy.com/keshi/' + listing_segment + '/' + str(page) + '.html'
            for page in range(1, int(page_counts[0]) + 1)
        ]
    
    def main():
        """Gather every listing-page URL, build one record per URL, export to Excel.

        The page URLs come from ``get_QA_url`` applied to each URL returned by
        ``get_all_date_url``. Each record is a dict keyed by the output column
        names; the records are written to ``xunyiwenyao.xlsx`` without the
        index column.
        """
        # Flatten the per-listing page lists into a single list of URLs.
        all_url_data = [
            page_url
            for date_url in get_all_date_url()
            for page_url in get_QA_url(date_url)
        ]
        info_list = []
        for detail_url in all_url_data:
            # NOTE(review): ``xx`` is not defined anywhere in the visible
            # source; it appears to be a placeholder for per-field extraction
            # logic that still has to be written. As it stands this raises
            # NameError on the first record.
            info_list.append({
                'url': detail_url,
                '患者标题': xx,
                '患者姓名': xx,
                '患者性别': xx,
                '提问日期': xx,
                '患者描述': xx,
                '医生姓名': xx,
                '医生职称': xx,
                '医生科室': xx,
                '问题分析': xx,
                '回答时间': xx,
            })
        df = pd.DataFrame(info_list)
        df.to_excel('xunyiwenyao.xlsx', index=False)
    
    # Script entry point: run the crawler only when executed directly.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    linux 下查看网速的方法 (不需要安装任何软件)
    Raspberry Pi Kernel Compilation 内核编译官方文档
    Kernel compiling for Pi 2
    从源码编译rpi的内核
    设备驱动调试和移植的一般方法
    爸爸的歌
    表扬?批评?
    日历插件js,jquery
    zepto jquery和zepto的区别?
    怎么学习PS快?
  • 原文地址:https://www.cnblogs.com/Erick-L/p/8311825.html
Copyright © 2020-2023  润新知