问题描述
爬取博客园首页的数据(URL:https://home.cnblogs.com/blog/page/1/),然后把结果写入自己的 Excel 文件。
环境:
OS:Windows 10
python:3.7
代码
import requests
import os
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy
import threading
import datetime


class BlogHome:
    """Scrape the cnblogs home listing pages and store the posts in an Excel sheet.

    Each listing page is downloaded, every ``post_block`` element on it is
    parsed, and one spreadsheet row per post is written (author, title,
    comment count, view count, publish date, summary).
    """

    # NOTE(review): this path (and ``self.path`` below) looks like it lost its
    # backslashes somewhere (e.g. ``C:\Users\...\Desktop\aaa.xls``) -- confirm
    # and correct before running.  The value is kept exactly as found.
    EXCEL_PATH = r"C:UserspeiqiangDesktopaaa.xls"

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    # Sheet row that receives the first scraped post.
    FIRST_DATA_ROW = 4

    # (console label, CSS class) of the per-post fields written after the
    # author and title columns, in column order.
    _DETAIL_FIELDS = (
        ("評論個数:", "post_comment"),    # comment count
        ("読込個数:", "post_view"),       # view count
        ("発表日付:", "postdate"),        # publish date
        ("詳細取得:", "entry_summary"),   # summary text
    )

    def __init__(self):
        # ``{}`` is replaced by the page number in ``request``.
        self.url = "https://home.cnblogs.com/blog/page/{}/"
        self.path = r"C:pythonProjectBlog"

    def request(self, param):
        """Download listing page number *param* and return the response body text."""
        url = self.url.format(param)
        # Fetch the formatted URL; requesting the raw template would ignore
        # the page number and return the same content for every page.
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return r.text

    def all_page(self, maxpage):
        """Scrape pages ``1 .. maxpage - 1`` into the first sheet of the workbook.

        The existing workbook at ``EXCEL_PATH`` is opened, copied into a
        writable workbook, filled starting at ``FIRST_DATA_ROW`` and saved
        back to the same path.  Note that *maxpage* itself is excluded.
        """
        wbk = xlrd.open_workbook(self.EXCEL_PATH, formatting_info=True)
        wb_copy = copy(wbk)
        sheet = wb_copy.get_sheet(0)

        row = self.FIRST_DATA_ROW
        for page in range(1, maxpage):
            # ``with`` releases the semaphore even when fetching or parsing
            # raises, so a failure cannot leak a slot.
            with thread_lock:
                row = self.getdata(self.request(page), sheet, row)

        wb_copy.save(self.EXCEL_PATH)
        print("書き込みました")

    def getdata(self, req, sheet, row):
        """Parse one listing page and write one sheet row per post.

        Args:
            req: markup of the listing page, as returned by ``request``.
            sheet: writable worksheet receiving the cells.
            row: index of the first row to write.

        Returns:
            The index of the row following the last one written.
        """
        soup = BeautifulSoup(req, "xml")
        for post in soup.find_all(class_="post_block"):
            col = 1

            # The first link of the title block is the author (wrapped in
            # square brackets), the second one is the post title.
            links = post.find(class_="entry_title").find_all("a")
            user = links[0].string.replace("[", "").replace("]", "")
            print("user:", user)
            sheet.write(row, col, user)
            col += 1

            print("title:", links[1].string)
            sheet.write(row, col, links[1].string)
            col += 1

            # The recommendation count (``entry_footer``) is not extracted.
            for label, css_class in self._DETAIL_FIELDS:
                value = post.find(class_=css_class).string
                print(label, value)
                sheet.write(row, col, value)
                col += 1

            row += 1
        return row

    def writeExcel(self, row, col, data):
        """Create a fresh workbook holding *data* at (*row*, *col*) and save it.

        The file at ``EXCEL_PATH`` is replaced by a workbook containing a
        single "Data" sheet with only this one cell written.
        """
        wbk = xlwt.Workbook()
        sheet = wbk.add_sheet("Data", cell_overwrite_ok=True)
        sheet.write(row, col, data)
        wbk.save(self.EXCEL_PATH)
        print("書き込みました")

    def mkdir(self):
        """Create the ``self.path`` directory if it is missing.

        Returns:
            True when the directory was created, False when it already existed.
        """
        path = self.path.strip()
        if os.path.exists(path):
            print(path, '文件夹已经存在了,不再创建')
            return False

        print('创建名字叫做', path, '的文件夹')
        os.makedirs(path)
        print('创建成功!')
        return True

    def getBlog(self):
        """Run the scrape via ``all_page(10)`` and print start, end and elapsed time."""
        startTime = datetime.datetime.now()
        print("開始", startTime)
        self.all_page(10)
        endTime = datetime.datetime.now()
        print("実行時間:", (endTime - startTime).seconds)
        print("開始", startTime)
        print("終了", endTime)


# Semaphore acquired around each page fetch in ``all_page``.  No threads are
# started in this script, so it only matters if ``all_page`` is ever called
# from several threads.
thread_lock = threading.BoundedSemaphore(value=10)

blogHome = BlogHome()
blogHome.getBlog()
执行上面的代码
Excel上面的数据