• python每日一题:爬虫某网站图书信息


    方案一:

    # -*- coding: UTF-8 -*-
    # 爬取33网站关于python的书籍,爬虫完一页后,点击后页菜单循环爬虫
    # 缺点:逐页点击并获得数据较费时
    import time
    from selenium import webdriver
    
    
    class url_surf(object):
        """Scrape book search results page by page with Selenium.

        Strategy: load the start URL, collect every ``detail`` entry on the
        current page, then click the ``next`` pagination link; stop when the
        link is missing or not clickable (i.e. the last page was reached).
        Results are appended to ``python_data.txt`` as one ``{title: url}``
        line per book.
        """

        def surf_web(self, url):
            """Crawl all result pages starting at *url* and record each
            book's title and link.

            :param url: full search-results URL of the first page.
            """
            num = 1
            # NOTE(review): Windows path must be a raw string — the original
            # had its backslashes stripped, which would never resolve.
            driver = webdriver.Chrome(r"D:\Program Files (x86)\python\chromedriver.exe")
            driver.get(url)
            time.sleep(5)  # crude fixed wait for the page to render
            # Open the output file once (the original reopened it on every
            # page) and guarantee it is closed even if scraping fails midway.
            file = open('python_data.txt', 'a', encoding='utf-8')
            try:
                while 1:
                    ele = driver.find_elements_by_class_name(r"detail")
                    for i in ele:
                        title = i.find_element_by_class_name("title-text")
                        bookurl = title.get_attribute('href')
                        file.write(str({title.text: bookurl}))
                        file.write('\n')
                        print(num, bookurl)
                    num += 1
                    print('is searching page: %s' % num)
                    try:
                        # On the last page the 'next' link is absent or not
                        # clickable; treat any failure here as end-of-results.
                        next_page = driver.find_element_by_class_name('next')
                        next_page.click()
                    except Exception:
                        print('game over')
                        break
                    time.sleep(5)
            finally:
                file.close()
    
    
    if __name__ == "__main__":
        # Entry point: crawl the Python book search results, following the
        # 'next page' link until the listing runs out.
        start_url = "https://book.888.com/subject_search?search_text=python+&cat=1001&start=0"
        crawler = url_surf()
        crawler.surf_web(start_url)
    方案二:

    # -*- coding: UTF-8 -*-
    # 爬取000网站关于python的书籍,采用多线程爬虫
    # 缺点:爬虫速度很快,但爬完一次后,提示需要登录账号,被反爬虫了
    import time
    from multiprocessing import Pool
    from selenium import webdriver
    
    
    class url_surf(object):
        """Crawl book search results with several parallel workers.

        Each worker starts at a different page number and advances in
        strides of 10 pages, so with a pool of 5 workers seeded with pages
        1..10 the whole listing is partitioned without overlap.

        Known limitation (noted by the original author): the site detects
        the parallel scraping quickly and starts demanding a login.
        """

        def __init__(self, url):
            # Base search URL; the numeric 'start' offset is appended by
            # create_url() for each page request.
            self.url = url

        def surf_web(self, a):
            """Worker loop: scrape page *a*, then a+10, a+20, ... until the
            'next' pagination link disappears (end of results).

            :param a: 1-based page number this worker starts from.
            """
            num = a
            # NOTE(review): Windows path must be a raw string — the original
            # had its backslashes stripped, which would never resolve.
            driver = webdriver.Chrome(r"D:\Program Files (x86)\python\chromedriver.exe")
            file = open('python_data.txt', 'a', encoding='utf-8')
            try:
                while 1:
                    driver.get(self.create_url(num))
                    time.sleep(5)  # crude fixed wait for the page to render
                    ele = driver.find_elements_by_class_name(r"detail")
                    for i in ele:
                        title = i.find_element_by_class_name("title-text")
                        bookurl = title.get_attribute('href')
                        file.write(str({title.text: bookurl}))
                        file.write('\n')
                        print(num, bookurl)
                    num += 10  # stride: each of the pool's workers owns every 10th page
                    print('is searching page: %s' % num)
                    try:
                        # Absence of a 'next' link marks the final page.
                        driver.find_element_by_class_name('next')
                    except Exception:
                        print(num, ':game over')
                        break
                    time.sleep(5)
            finally:
                # The original left this commented out, leaking one file
                # handle per worker process.
                file.close()

        def create_url(self, k):
            """Return the search URL for 1-based page number *k*.

            The site paginates 15 results per page, so the 'start' offset
            for page k is 15 * (k - 1).
            """
            return self.url + str(int(k * 15 - 15))
    
    
    if __name__ == "__main__":
        # Fan five worker processes out over start pages 1..10; each worker
        # then strides forward by 10 pages (see url_surf.surf_web).
        base_url = "https://book.999.com/subject_search?search_text=%E6%B5%AA%E9%87%8C&cat=1001&start="
        crawler = url_surf(base_url)
        pool = Pool(5)
        for page in range(1, 11):
            pool.apply_async(crawler.surf_web, (page,))
        pool.close()
        pool.join()
    
    
    
     
  • 相关阅读:
    C#中属性和字段的区别
    利用原生态的(System.Web.Extensions)JavaScriptSerializer将mvc 前台提交到controller序列化复杂对象
    点击图片后放大居中显示
    Python logging模块
    MySQL免安装版 配置
    python 字符串 常用方法
    python 格式化输出
    欢迎使用CSDN-markdown编辑器
    Program received signal SIGSEGV, Segmentation fault.
    error MSB6006: “CL.exe”已退出,代码为 -1073741502。
  • 原文地址:https://www.cnblogs.com/xuehaiwuya0000/p/10734004.html
Copyright © 2020-2023  润新知