为了防止爬虫速度过快而被限制 IP,每爬取完一个页面就休眠 6 秒。
初学爬虫,写得有点简单。
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import csv
import time

# Output CSV path (hard-coded for the author's machine; adjust as needed).
ExcelName = "F:/大学/毕业设计/资料文档/方剂.csv"

# Run this once to create the CSV header row, then keep it commented out so
# repeated runs only append data rows:
# with open(ExcelName, 'w', encoding='utf-8', newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(["方名", "出处", "功用大类", "功用小类", "处方", "炮制", "功用", "主治", "附方"])

# Column labels, in CSV column order.  Each label is also the literal text of
# the <td> cell that precedes the value on the scraped page, so this list
# drives both the parsing loop and the row layout.
FIELDS = ["方名", "出处", "功用大类", "功用小类", "处方", "炮制", "功用", "主治", "附方"]

HEADERS = {
    # Pretend to be a regular desktop browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    # Session cookie copied from a logged-in browser session; the site requires
    # it.  NOTE(review): this expires — refresh it before each crawl run.
    'Cookie': (
        'kztoken=nJail6zJp6iXaJqWmGpnZmlwYZyZ; his=a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYZya%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpeX%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpmU%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpqS%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpuU%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5aV%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5aa%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5mX%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlvaZaU%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlwYZyZ%22%3B%7D; bigdata_use_tips=1; PHPSESSID=iiiqpops4jemgoh33rbrkqhue5; yaozh_logintime=1615682156; yaozh_user=1026728%09%E4%B8%80%E5%BE%80%E6%97%A0%E5%89%8Dgy; yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyYnoaSZ5drnJadg26qb21rg66flM6bh5%2BscZJsbIVJGuFJIuEd%2FNVK7fLIrFlwq2uac1OfwqnZw62gzp1Unti163E4711aE449B15f37E26dF531cDF2DckpSeg2ibZpmdlpVpaGpabNRzZW2Dqs7Rnlmcq2yUmJyDlZqSbJttl5Wammhqalps3g%3D%3D0fc4e597aa9b7a0a8b55788b6dfd7894; _ga=GA1.2.2493188.1609388760; _gid=GA1.2.1909203093.1615682102; kztoken=nJail6zJp6iXaJqWmGpnZmlsZJuU; '
        'his=a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqaZyU%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqaZyb%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapSU%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapSa%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapWX%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapib%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapqY%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlrYZmb%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlrZJaS%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlsZJuU%22%3B%7D; zhuce_show=true; acw_tc=2f624a1716156878327574920e31b8726ca5960ab6c9d6b0f869dc5e312a44; think_language=zh-CN; _ga=GA1.3.165986868.1609388536; _gid=GA1.3.1909203093.1615682102; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1615467255,1615682102,1615682160,1615683332; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1615688097'
    ),
}


def get_contents(ulist, url):
    """Scrape one prescription detail page and append its fields to the CSV.

    Parameters
    ----------
    ulist : list
        Unused; kept only for backward compatibility with existing callers
        (the original append into it was already commented out).
    url : str
        Address of a single db.yaozh.com/fangji detail page.

    Side effects: performs one HTTP GET, prints the prescription name
    (方名) when found, and appends one row to ``ExcelName``.
    """
    session = requests.Session()
    response = session.get(url, headers=HEADERS)
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # Default every field to "" so the CSV row always has all 9 columns,
    # even when a label is missing from the page.
    values = {field: "" for field in FIELDS}
    for tr in soup.find_all('tr'):
        for td in tr:
            label = td.string
            if label not in values:
                continue
            span = tr.find('span')
            if span is None:
                continue  # malformed row: label present but no value cell
            # The span text looks like "...}actual value"; keep the part after
            # the first '}'.  Guard against text with no '}' at all, which
            # previously raised IndexError.
            parts = span.get_text().split('}')
            if len(parts) > 1:
                values[label] = parts[1]
                if label == '方名':
                    print(values[label])

    # Append mode so rows from successive pages accumulate in one file.
    with open(ExcelName, 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([values[field] for field in FIELDS])


def main():
    """Crawl detail pages 10001600–10001999, writing one CSV row per page."""
    urli = []
    for i in range(1600, 2000):
        url = f"https://db.yaozh.com/fangji/{i + 10000000}.html"
        print("开始爬取")
        get_contents(urli, url)
        print("开始保存")
        # Sleep 6 s between pages so the crawler is not rate-limited or
        # IP-banned (see the note at the top of the file).
        time.sleep(6)


if __name__ == "__main__":
    main()