• python每日一题:爬虫某网站图书信息


    方案一:

    # -*- coding: UTF-8 -*-
    # 爬取33网站关于python的书籍,爬虫完一页后,点击后页菜单循环爬虫
    # 缺点:逐页点击并获得数据较费时
    import time
    from selenium import webdriver
    
    
    class url_surf(object):
        """Scrape book search results by clicking through the paginated listing.

        Strategy: load the first result page, harvest every entry on it, then
        click the "next" control and repeat until no "next" element exists.
        Slower than offset-based crawling because each page needs a click plus
        a fixed wait.
        """

        def surf_web(self, url):
            """Crawl *url* page by page, appending {title: link} lines to python_data.txt.

            Side effects: launches a Chrome WebDriver, writes to
            'python_data.txt' in the current directory, prints progress.
            Returns None.
            """
            num = 1
            # NOTE(review): the driver path lost its separators in the paste —
            # restored as a raw string; confirm it matches the local install.
            driver = webdriver.Chrome(r"D:\Program Files (x86)\python\chromedriver.exe")
            try:
                driver.get(url)
                time.sleep(5)  # crude wait for the JS-rendered results to appear
                # Open once in append mode; the original reopened it every page
                # and only closed it on the failure path.
                with open('python_data.txt', 'a', encoding='utf-8') as file:
                    while True:
                        for entry in driver.find_elements_by_class_name("detail"):
                            title = entry.find_element_by_class_name("title-text")
                            bookurl = title.get_attribute('href')
                            file.write(str({title.text: bookurl}))
                            file.write('\n')
                            print(num, bookurl)
                        num += 1
                        print('is searching page: %s' % num)
                        try:
                            # On the last page there is no clickable "next"
                            # control, so this lookup/click failing means done.
                            next_page = driver.find_element_by_class_name('next')
                            next_page.click()
                        except Exception:
                            print('game over')
                            break
                        time.sleep(5)
            finally:
                driver.quit()  # always release the browser process
    
    
    if __name__ == "__main__":
        # Entry point: crawl the "python" book search results, following the
        # "next" link page after page until it disappears.
        start_url = "https://book.888.com/subject_search?search_text=python+&cat=1001&start=0"
        scraper = url_surf()
        scraper.surf_web(start_url)
    方案二:

    # -*- coding: UTF-8 -*-
    # 爬取000网站关于python的书籍,采用多线程爬虫
    # 缺点:爬虫速度,很快,但爬完一次后,提示需要登录账号,被反爬虫了
    import time
    from multiprocessing import Pool
    from selenium import webdriver
    
    
    class url_surf(object):
        """Multi-worker scraper using offset-based URLs instead of clicking "next".

        Each worker starts at a different 1-based page number and advances in
        strides of 10 so the workers interleave without overlapping. Faster
        than click-based paging, but the target site rate-limits and demands a
        login after one full pass (i.e. it gets anti-scraped).
        """

        def __init__(self, url):
            # Base search URL; create_url() appends the numeric `start=` offset.
            self.url = url

        def surf_web(self, a):
            """Crawl pages a, a+10, a+20, ... appending {title: link} lines to python_data.txt.

            Side effects: launches a Chrome WebDriver, appends to
            'python_data.txt', prints progress. Returns None.
            """
            num = a
            # NOTE(review): the driver path lost its separators in the paste —
            # restored as a raw string; confirm it matches the local install.
            driver = webdriver.Chrome(r"D:\Program Files (x86)\python\chromedriver.exe")
            try:
                # Open once per worker; the original leaked this handle (its
                # close() call was commented out).
                with open('python_data.txt', 'a', encoding='utf-8') as file:
                    while True:
                        driver.get(self.create_url(num))
                        time.sleep(5)  # crude wait for the JS-rendered results
                        for entry in driver.find_elements_by_class_name("detail"):
                            title = entry.find_element_by_class_name("title-text")
                            bookurl = title.get_attribute('href')
                            file.write(str({title.text: bookurl}))
                            file.write('\n')
                            print(num, bookurl)
                        num += 10  # 10 workers stride by 10: worker i covers i, i+10, ...
                        print('is searching page: %s' % num)
                        try:
                            # A missing "next" element means the last page was reached.
                            driver.find_element_by_class_name('next')
                        except Exception:
                            print(num, ':game over')
                            break
                        time.sleep(5)
            finally:
                driver.quit()  # always release the browser process

        def create_url(self, k):
            """Return the search URL for 1-based page *k* (15 results per page)."""
            return self.url + str(int(k * 15 - 15))
    
    
    if __name__ == "__main__":
        # Entry point: fan out 10 crawl jobs over a pool of 5 processes, each
        # job starting at a different page offset.
        base = "https://book.999.com/subject_search?search_text=%E6%B5%AA%E9%87%8C&cat=1001&start="
        scraper = url_surf(base)
        workers = Pool(5)
        for start_page in range(1, 11):
            workers.apply_async(scraper.surf_web, (start_page,))
        workers.close()
        workers.join()
    
    
    
     
  • 相关阅读:
    ubuntu nfs server config
    增加虚拟机ubuntu的硬盘
    MD5算法
    MySQL错误ERROR 1366 (HY000): Incorrect string value..
    SQLPro Studio链接本地MYSQL
    mysql修改root密码
    jdk7 for Mac
    mysql is stop 点击start启动不了,卸载重装
    Mac版Intellij IDEA弹窗报this license XXXXXXXX has been cancelled
    Python 环境搭建
  • 原文地址:https://www.cnblogs.com/xuehaiwuya0000/p/10734004.html
Copyright © 2020-2023  润新知