"""Scrape bidding-announcement titles from dlzb.com search results.

For each keyword, pages of search results are fetched, anchor titles are
extracted, and rows of (title, keyword) are appended to a CSV file and an
existing XLS workbook.
"""

import csv
import random
import re
import time

import requests
import xlrd
from bs4 import BeautifulSoup
from xlutils.copy import copy

# Suppress the InsecureRequestWarning triggered by verify=False below.
requests.packages.urllib3.disable_warnings()


class Spider():
    """Paginated scraper for https://www.dlzb.com/zb/search.php."""

    def __init__(self, keyworks):
        """Prepare request headers and output paths for one keyword.

        NOTE(review): the parameter name "keyworks" looks like a typo for
        "keywords"; kept unchanged for backward compatibility with callers
        that pass it by keyword.
        """
        self.kw = keyworks
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Cookie": "D3z_vi-ds=f1f6d61ffd02c29c1cd832a363888be3; __jsluid_s=0b360d705e0e333a682280ae3b03bf90; Hm_lvt_c909c1510b4aebf2db610b8d191cbe91=1655284406; Hm_lpvt_c909c1510b4aebf2db610b8d191cbe91=1655285546",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        }
        self.csv_name = "123.csv"
        self.url = "https://www.dlzb.com/zb/search.php?kw=" + str(self.kw)
        self.path = "123.xls"

    def getContent(self, nextUrl):
        """Fetch one search-result page and return the Response object.

        verify=False disables TLS certificate verification (the matching
        urllib3 warning is suppressed at module import above).
        """
        return requests.get(nextUrl, headers=self.headers, verify=False)

    def writeXlx(self, title, kw):
        """Append one (title, keyword) row to the CSV file.

        Bug fix: the original opened the file without a context manager,
        so the handle leaked if writerow() raised; `with` guarantees close.
        """
        with open(self.csv_name, 'a+', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([title, kw])

    def getPage(self, p, total):
        """Scrape up to `total` pages, starting at page number `p`.

        Stops early when a page no longer contains the result list
        (assumed to mean we ran past the last page of results).
        """
        for num in range(total):
            nextUrl = self.url + "&page=" + str(num + p)
            print(nextUrl)
            content = self.getContent(nextUrl)
            # re.S lets .*? span newlines inside the <ul> block.
            res = re.findall(r'<ul class=\"gclist_ul listnew\">(.*?)<\/ul>', content.text, re.S)
            if res:
                titleList = re.findall(r'<a class=\"gccon_title\".*?>(.*?)<\/a>', res[0], re.S)
                data = []
                for title in titleList:
                    # Strip any inline HTML tags from the anchor text.
                    soup = BeautifulSoup(title, 'html.parser')
                    print(str(self.kw) + "-第" + str(num + p) + "页", soup.get_text())
                    data.append([soup.get_text(), self.kw])
                self.writeXLSAppend(data)
                # Random delay between pages to avoid hammering the server.
                time.sleep(random.uniform(1, 10))
            else:
                # No result list found: log the raw page and stop paging.
                print(self.kw, "end*************")
                print(content.text)
                print(res)
                break

    def writeXLSAppend(self, value):
        """Append the rows in `value` to the first sheet of self.path.

        NOTE(review): xlrd.open_workbook requires that the .xls file
        already exists — confirm 123.xls is created before the first run.
        """
        workbook = xlrd.open_workbook(self.path)
        sheets = workbook.sheet_names()
        worksheet = workbook.sheet_by_name(sheets[0])
        rows_old = worksheet.nrows  # existing rows; new data goes after them
        new_workbook = copy(workbook)  # convert read-only xlrd book to writable xlwt book
        new_worksheet = new_workbook.get_sheet(0)
        for i, row in enumerate(value):
            for j, cell in enumerate(row):
                new_worksheet.write(i + rows_old, j, cell)
        new_workbook.save(self.path)
        print("xls格式表格【追加】写入数据成功!")


if __name__ == '__main__':
    # Example query: https://www.dlzb.com/zb/search.php?kw=%E6%99%BA%E8%83%BD%E5%AE%A2%E6%9C%8D
    # Bug fix: renamed `list` (which shadowed the builtin) to `keywords`.
    keywords = ['人工智能', '知识图谱', '计算机视觉', '图像识别',
                '文本挖掘', '文本分析', '知识问答', '神经网络']
    for kw in keywords:
        print(kw)
        # Bug fix: getPage() returns None, so the original
        # `spider = Spider(x).getPage(1, 1000)` bound nothing useful.
        spider = Spider(kw)
        spider.getPage(1, 1000)  # start at page 1, scan up to 1000 pages