• python3 crawler: fetching JD product listings with Selenium
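
    The script below drives a headless Chrome through Selenium: for each search keyword it loads jd.com, submits the keyword, scrolls to the bottom of the results page so the lazily loaded items render, parses the final page source with lxml, and appends one JSON object per product (name, price, review count, shop) to a jd-<keyword>.json file, clicking "next page" until the keyword is exhausted.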


    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.options import Options
    from selenium.common.exceptions import NoSuchElementException, TimeoutException
    from lxml import etree
    import time, json
    
    JD_URL_Login = "https://www.jd.com/"
    
    
    class CustomizeException(Exception):
        def __init__(self, status, msg):
            self.status = status
            self.msg = msg
    
    
    class JD:
        def __init__(self):
            self.browser = None
            self.__init_browser()
    
        def __init_browser(self):
            options = Options()
            options.add_argument("--headless")
            options.add_experimental_option('excludeSwitches', ['enable-automation'])
            # Run in no-image mode to speed up page loads
            options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
            self.browser = webdriver.Chrome(options=options)
            # Maximize the browser window
            self.browser.maximize_window()
            # Implicit wait of 3 seconds
            self.browser.implicitly_wait(3)
            self.browser.get(JD_URL_Login)
            self.wait = WebDriverWait(self.browser, 10)
    
        def __search_goods(self, goods):
            '''Search for a product from the JD home page.'''
            # Close the output file left over from a previous keyword, if any
            if getattr(self, "file", None):
                self.file.close()
            self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
            self.wait.until(EC.presence_of_all_elements_located((By.ID, "key")))
            search_input = self.browser.find_element(By.ID, "key")
            search_input.clear()
            search_input.send_keys(goods, Keys.ENTER)
    
        def __get_goods_info(self, page_source):
            '''Extract the desired fields from the page source.'''
            selector_html = etree.HTML(page_source)
            # Product name. TODO: don't rely on the title attribute; better to take the text content of the product-name node
            goods_name = selector_html.xpath("//div[@class='gl-i-wrap']//div[contains(@class,'p-name')]/a/@title")

            # Product price
            goods_price = selector_html.xpath("//div[@class='gl-i-wrap']//div[@class='p-price']/strong/i/text()")

            # Number of reviews
            comment_num_selector = selector_html.xpath("//div[@class='p-commit']/strong")
            comment_num = [selector.xpath("string(.)") for selector in comment_num_selector]

            # Shop name
            shop_name = selector_html.xpath("//a[@class='curr-shop']/text()")
    
            goods_zip = zip(goods_name, goods_price, comment_num, shop_name)
            for goods_info in goods_zip:
                dic = {}
                dic["goods_name"] = goods_info[0]
                dic["goods_price"] = goods_info[1]
                dic["comment_num"] = goods_info[2]
                dic["shop_name"] = goods_info[3]
                # print("商品名字>>:", goods_info[0])
                # print("商品价格>>:", goods_info[1])
                # print("商品评价数量>>:", goods_info[2])
                # print("商品店铺>>:", goods_info[3])
                # print("*" * 100)
                yield dic
    
        def __swipe_page(self):
            '''Scroll to the bottom until the page height stops growing, then return the full page source.'''
            height = self.browser.execute_script("return document.body.scrollHeight;")
            js = "window.scrollTo(0, {});".format(height)
            self.browser.execute_script(js)
            while True:
                time.sleep(1)
                now_height = self.browser.execute_script("return document.body.scrollHeight;")
                if height == now_height:
                    return self.browser.page_source
                js = "window.scrollTo({}, {});".format(height, now_height)
                self.browser.execute_script(js)
                height = now_height
    
        def __is_element_exists(self, xpath):
            '''Check whether an element can be located with the given XPath.'''
            try:
                self.browser.find_element(By.XPATH, xpath)
                return True
            except NoSuchElementException:
                return False
    
        def __click_next_page(self):
            '''Click "next page" to paginate.'''
            try:
                self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
            except TimeoutException:
                raise CustomizeException(10000, "finished crawling this product")
            xpath = "//a[@class='pn-next']"
            if not self.__is_element_exists(xpath):
                raise CustomizeException(10000, "finished crawling this product")
            self.browser.find_element(By.XPATH, xpath).click()
    
        def __write_to_json(self, dic: dict):
            data_json = json.dumps(dic, ensure_ascii=False)
            self.file.write(data_json + "\n")
    
        def run(self, goods):
            self.__search_goods(goods)
            n = 1
            while True:
                print("正在爬取商品 <{}>---第{}页......".format(goods, n))
                time.sleep(3)
                html = self.__swipe_page()
                for dic in self.__get_goods_info(html):
                    self.__write_to_json(dic)
                try:
                    self.__click_next_page()
                except CustomizeException:
                    # This keyword is finished; pull the next keyword from the module-level goods_list
                    try:
                        goods = goods_list.pop(0)
                        self.run(goods)
                    except IndexError:
                        return
                n += 1
    
        def __del__(self):
            # quit() shuts down the whole driver; the file may not exist if run() was never called
            self.browser.quit()
            if getattr(self, "file", None):
                self.file.close()
    
    
    if __name__ == '__main__':
        jd = JD()
        goods_list = ["纯牛奶", "酸奶", "奶茶", "床上用品", "电磁炉", "电视", "小米笔记本", "华硕笔记本", "联想笔记本", "男士洗面奶", "女士洗面奶", "沐浴露", "洗发露",
                      "牙刷", "牙膏", "拖鞋", "剃须刀", "水手服", "运动服", "红龙果", "苹果", "香蕉", "洗衣液", "电饭煲"]
        try:
            goods = goods_list.pop(0)
        except IndexError:
            raise CustomizeException(20000, "goods_list must not be empty")
        try:
            jd.run(goods)
        finally:
            del jd
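
    Since __write_to_json emits one JSON object per line (the JSON Lines format), the output file can be consumed line by line. A minimal sketch of a consumer, assuming a finished run has produced jd-纯牛奶.json (the filename pattern jd-<keyword>.json comes from __search_goods; picking out the cheapest item is just an illustration):

        import json

        cheapest = None
        with open("jd-纯牛奶.json", encoding="utf-8") as f:
            for line in f:
                record = json.loads(line)
                try:
                    price = float(record["goods_price"])
                except ValueError:
                    continue  # some listings show non-numeric prices; skip them
                if cheapest is None or price < cheapest[0]:
                    cheapest = (price, record["goods_name"], record["shop_name"])

        if cheapest:
            print("Cheapest item: {1} ({2}) at {0}".format(*cheapest))
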
• Original article: https://www.cnblogs.com/zhuchunyu/p/10765875.html