import csv
import time

import requests
from bs4 import BeautifulSoup


class MovieHeven():
    def __init__(self):
        self.url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
        self.page = 1
        self.No = 1
        # The site serves GBK-encoded pages, so the CSV is written as GBK too
        self.fobj = open("movies.csv", "wt", encoding="gbk", newline='')

    def spider(self):
        try:
            print("Crawling page {}...".format(self.page))
            # Fetch the listing page and decode it with the site's GBK encoding
            resp = requests.get(self.url, timeout=10)
            resp.encoding = "gbk"
            # Load the document into BeautifulSoup
            root = BeautifulSoup(resp.text, "lxml")
            # Each movie entry sits in a <table> under div.co_content8 > ul
            tables = root.find("div", attrs={"class": "co_content8"}).find("ul").find_all("table")
            writer = csv.writer(self.fobj)
            for table in tables:
                name = table.find("a").text
                href = "http://www.dytt8.net" + table.find("a")["href"]
                # Write one (title, detail-page URL) row per movie
                writer.writerow([name, href])
                print("No:", self.No, name, href)
                self.No += 1
            # The pager links live in div.co_content8 > div.x;
            # "下一页" is the site's literal "next page" link text
            urls = root.find("div", attrs={"class": "co_content8"}).find("div", attrs={"class": "x"}).find_all("a")
            for u in urls:
                if u.text == "下一页":
                    self.url = "https://www.dytt8.net/html/gndy/dyzz/" + u["href"]
                    print(self.url)
                    self.page += 1
                    self.spider()  # recurse into the next page
        except Exception as err:
            # Any network or parsing error ends the crawl
            print(err)

    def main(self):
        begin_time = time.time()
        self.spider()
        self.fobj.close()
        elapsed = time.time() - begin_time  # 'elapsed' avoids shadowing the time module
        m, s = divmod(round(elapsed), 60)
        print("Elapsed: {}min{}s".format(m, s))


if __name__ == '__main__':
    billie = MovieHeven()
    billie.main()
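
# A minimal alternative sketch (not part of the original script): the same crawl
# written with a while loop instead of recursion. spider() above calls itself once
# per page, so a long listing adds one stack frame per page and could eventually
# hit Python's recursion limit; this variant keeps constant stack depth. It assumes
# the same page structure (div.co_content8 with a div.x pager) and GBK encoding;
# the function name and the movies_iter.csv output file are my own choices.
def spider_iterative(start_url="https://www.dytt8.net/html/gndy/dyzz/index.html"):
    url, page = start_url, 1
    with open("movies_iter.csv", "wt", encoding="gbk", newline='') as fobj:
        writer = csv.writer(fobj)
        while url:
            print("Crawling page {}...".format(page))
            resp = requests.get(url, timeout=10)
            resp.encoding = "gbk"
            content = BeautifulSoup(resp.text, "lxml").find("div", attrs={"class": "co_content8"})
            for table in content.find("ul").find_all("table"):
                a = table.find("a")
                writer.writerow([a.text, "http://www.dytt8.net" + a["href"]])
            # Follow the "下一页" ("next page") link if present, otherwise stop
            url = None
            for u in content.find("div", attrs={"class": "x"}).find_all("a"):
                if u.text == "下一页":
                    url = "https://www.dytt8.net/html/gndy/dyzz/" + u["href"]
            page += 1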