Solution 1:
# -*- coding: UTF-8 -*-
# Scrape Python-related books from site 33: after each page is scraped,
# click the "next page" link and repeat.
# Drawback: clicking through and collecting the data page by page is slow.
import time

from selenium import webdriver


class url_surf(object):

    def surf_web(self, url):
        num = 1
        driver = webdriver.Chrome(r"D:\Program Files (x86)\python\chromedriver.exe")
        driver.get(url)
        time.sleep(5)
        while 1:
            ele = driver.find_elements_by_class_name(r"detail")
            file = open('python_data.txt', 'a', encoding='utf-8')
            for i in ele:
                title = i.find_element_by_class_name("title-text")
                bookurl = title.get_attribute('href')
                file.write(str({title.text: bookurl}))
                file.write(' ')
                print(num, bookurl)
                num += 1
            print('is searching page: %s' % num)
            try:
                # On the last page the "next" link is missing or unclickable,
                # which marks the end of the crawl.
                next_page = driver.find_element_by_class_name('next')
                next_page.click()
            except:
                file.close()
                print('game over')
                break
            time.sleep(5)


if __name__ == "__main__":
    url = "https://book.888.com/subject_search?search_text=python+&cat=1001&start=0"
    a = url_surf()
    a.surf_web(url)
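Note: the script above relies on the Selenium 3 find_element(s)_by_class_name helpers and passes the chromedriver path directly to webdriver.Chrome; both were removed in Selenium 4. Below is a minimal sketch, assuming Selenium 4 is installed, of the same page-by-page loop ported to the locator-based API. The class names ("detail", "title-text", "next"), output file, and driver path are copied from the script above; everything else is illustrative rather than the author's code.

# Minimal sketch of the same loop on Selenium 4 (locator-based API).
# Class names and the driver path are taken from the script above;
# adjust them for your own environment.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service


def surf_web(url):
    driver = webdriver.Chrome(service=Service(r"D:\Program Files (x86)\python\chromedriver.exe"))
    driver.get(url)
    time.sleep(5)
    with open('python_data.txt', 'a', encoding='utf-8') as file:
        while True:
            for item in driver.find_elements(By.CLASS_NAME, "detail"):
                title = item.find_element(By.CLASS_NAME, "title-text")
                file.write(str({title.text: title.get_attribute('href')}) + ' ')
            try:
                driver.find_element(By.CLASS_NAME, 'next').click()
            except Exception:
                break  # no "next" link: last page reached
            time.sleep(5)
    driver.quit()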
Solution 2:
# -*- coding: UTF-8 -*-
# Scrape Python-related books from site 000 using a pool of processes.
# Drawback: crawling is very fast, but after one full run the site starts
# asking for a login, i.e. the crawler gets blocked by anti-scraping measures.
import time
from multiprocessing import Pool

from selenium import webdriver


class url_surf(object):

    def __init__(self, url):
        self.url = url

    def surf_web(self, a):
        num = a
        driver = webdriver.Chrome(r"D:\Program Files (x86)\python\chromedriver.exe")
        file = open('python_data.txt', 'a', encoding='utf-8')
        while 1:
            driver.get(self.create_url(num))
            time.sleep(5)
            ele = driver.find_elements_by_class_name(r"detail")
            for i in ele:
                title = i.find_element_by_class_name("title-text")
                bookurl = title.get_attribute('href')
                file.write(str({title.text: bookurl}))
                file.write(' ')
                print(num, bookurl)
            num += 10
            print('is searching page: %s' % num)
            try:
                # On the last page there is no "next" element, so this worker is done.
                next_page = driver.find_element_by_class_name('next')
            except:
                # file.close()
                print(num, ':game over')
                break
            time.sleep(5)

    def create_url(self, k):
        return self.url + str(int(k * 15 - 15))


if __name__ == "__main__":
    url = "https://book.999.com/subject_search?search_text=%E6%B5%AA%E9%87%8C&cat=1001&start="
    a = url_surf(url)
    pool = Pool(5)
    for i in range(1, 11):
        pool.apply_async(a.surf_web, (i,))
    pool.close()
    pool.join()
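One caveat with pool.apply_async as used above: it returns an AsyncResult, and any exception raised inside a worker (for example when the site starts demanding a login) is silently discarded unless .get() is called on that result. Below is a minimal sketch of collecting those results; crawl_pages is a hypothetical placeholder standing in for url_surf.surf_web, and the simulated failure exists only for illustration.

# Minimal sketch: keep the AsyncResult objects and call .get() so that
# exceptions raised inside a worker surface in the parent process
# instead of being silently dropped.
from multiprocessing import Pool


def crawl_pages(start_page):
    # hypothetical placeholder for url_surf.surf_web from the script above;
    # raise on one input to simulate a blocked/login-required page
    if start_page == 3:
        raise RuntimeError('site asked for a login')
    return start_page


if __name__ == "__main__":
    with Pool(5) as pool:
        results = [pool.apply_async(crawl_pages, (i,)) for i in range(1, 11)]
        for r in results:
            try:
                print('worker finished page', r.get())  # .get() re-raises worker errors
            except Exception as exc:
                print('worker failed:', exc)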