• 爬取药智网中的方剂信息


    为了防止爬取速度过快而被网站限制 IP,每爬取完一个页面就休眠 6 秒。

    初学爬虫,写得比较简单。

    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup
    import requests
    import csv
    import time
    
    # Destination CSV file that the scraped formula records are written to.
    ExcelName = "F:/大学/毕业设计/资料文档/方剂.csv"
    # Write the header row.
    # NOTE(review): this block is commented out, and get_contents opens the file
    # in append mode, so presumably it was run once before the first crawl to
    # create the file with its header -- TODO confirm.
    # with open(ExcelName, 'w', encoding='utf-8', newline='') as csvfile:
    #     writer = csv.writer(csvfile)
    #     writer.writerow(["方名","出处","功用大类" ,"功用小类","处方","炮制","功用","主治","附方"])
    
    # Row labels looked for on a detail page, in the column order of the CSV row.
    _FIELD_LABELS = ("方名", "出处", "功用大类", "功用小类", "处方", "炮制", "功用", "主治", "附方")

    # Seconds to wait for the server before giving up, so a stalled connection
    # cannot hang the crawl forever.
    _REQUEST_TIMEOUT = 30


    def _field_value(tr):
        """Return the value text from the first <span> of a table row.

        Returns None when the row has no <span>. When the span text contains
        '}', the segment between the first and second '}' is returned (what
        ``split('}')[1]`` yields); otherwise the whole text is returned instead
        of raising IndexError.
        """
        span = tr.find('span')
        if span is None:
            return None
        text = span.get_text()
        pieces = text.split('}')
        # NOTE(review): the value is presumably preceded by an inline
        # style/script fragment ending in '}' -- confirm against a live page.
        return pieces[1] if len(pieces) > 1 else text


    def get_contents(ulist, url):
        """Fetch one formula detail page and append its fields to the CSV file.

        The page is downloaded with browser-like headers and parsed with
        BeautifulSoup. Every table row is scanned for a child whose string
        equals one of the labels in ``_FIELD_LABELS``; the matching row's value
        is taken from its first <span>. One row is then appended to
        ``ExcelName``, with an empty string for every label that was not found.

        Args:
            ulist: Unused; kept so existing callers keep working.
            url: Address of the detail page to fetch.

        Returns:
            None. The result is written to the CSV file.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        headers = {
            # Pretend to be a regular browser.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
            # Put the Cookie copied from your own browser session here.
            # NOTE(review): this is a hard-coded session credential; it will
            # expire and should not be published -- consider loading it from
            # configuration instead.
            'Cookie': (
                'kztoken=nJail6zJp6iXaJqWmGpnZmlwYZyZ; his=a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYZya%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpeX%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpmU%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpqS%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpuU%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5aV%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5aa%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5mX%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlvaZaU%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlwYZyZ%22%3B%7D; bigdata_use_tips=1; PHPSESSID=iiiqpops4jemgoh33rbrkqhue5; yaozh_logintime=1615682156; yaozh_user=1026728%09%E4%B8%80%E5%BE%80%E6%97%A0%E5%89%8Dgy; yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyYnoaSZ5drnJadg26qb21rg66flM6bh5%2BscZJsbIVJGuFJIuEd%2FNVK7fLIrFlwq2uac1OfwqnZw62gzp1Unti163E4711aE449B15f37E26dF531cDF2DckpSeg2ibZpmdlpVpaGpabNRzZW2Dqs7Rnlmcq2yUmJyDlZqSbJttl5Wammhqalps3g%3D%3D0fc4e597aa9b7a0a8b55788b6dfd7894; _ga=GA1.2.2493188.1609388760; _gid=GA1.2.1909203093.1615682102; kztoken=nJail6zJp6iXaJqWmGpnZmlsZJuU; his=a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqaZyU%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqaZyb%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapSU%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapSa%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapWX%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapib%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapqY%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlrYZmb%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlrZJaS%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlsZJuU%22%3B%7D; zhuce_show=true; acw_tc=2f624a1716156878327574920e31b8726ca5960ab6c9d6b0f869dc5e312a44; think_language=zh-CN; _ga=GA1.3.165986868.1609388536; _gid=GA1.3.1909203093.1615682102; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1615467255,1615682102,1615682160,1615683332; '
                'Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1615688097'
            ),
        }

        # The context manager releases the session's connections when done.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
            response.encoding = 'UTF-8'
            html = response.text  # Page content as decoded HTML text.

        soup = BeautifulSoup(html, 'lxml')

        # Every column starts empty so a missing row still yields a full CSV row.
        fields = dict.fromkeys(_FIELD_LABELS, "")
        for tr in soup.find_all('tr'):
            # Iterate the row's direct children; the one whose string is a known
            # label identifies which field this row carries.
            for cell in tr:
                label = cell.string
                if label is None or label not in _FIELD_LABELS:
                    continue
                value = _field_value(tr)
                if value is None:
                    continue
                fields[str(label)] = value
                if label == "方名":
                    # Progress output: show the name of the formula just parsed.
                    print(value)

        with open(ExcelName, 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([fields[label] for label in _FIELD_LABELS])
    
        # 保存资源
    
    
    def main(start=1600, stop=2000, delay=6):
        """Crawl a range of formula detail pages, one CSV row per page.

        Page ``i`` is fetched from
        ``https://db.yaozh.com/fangji/{i + 10000000}.html`` for every ``i`` in
        ``range(start, stop)``. The defaults reproduce the originally
        hard-coded range and pause.

        Args:
            start: First page offset to crawl (inclusive).
            stop: Page offset at which to stop (exclusive).
            delay: Seconds to sleep after each page, to avoid being
                rate-limited or IP-banned by the site.
        """
        # Passed through to get_contents, which currently does not use it.
        urli = []
        for i in range(start, stop):
            url = f"https://db.yaozh.com/fangji/{i+10000000}.html"

            print ("开始爬取")
            get_contents(urli, url)
            print("开始保存")
            time.sleep(delay)
    
    # Start the crawl only when the file is executed as a script, so importing
    # the module does not trigger network requests.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    事务创建函数
    实现Xshell断开连接情况下Linux命令继续执行
    MySQL UNION 操作符
    CentOS安装部署Mysql 5.7
    连接数据库
    @Results用法总结
    在Java中如何高效的判断数组中是否包含某个元素
    Java中的map集合顺序如何与添加顺序一样
    instanceof的用法
    枚举
  • 原文地址:https://www.cnblogs.com/1gaoyu/p/14533797.html
Copyright © 2020-2023  润新知