• Scraping JD.com mobile phone listings (all phones)


    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

    For learning and reference only; commercial use is not permitted.

    version_0

    Notes: a single-threaded crawler built entirely on Python standard-library modules, including urllib, json, etc.

       I wrote this crawler to get familiar with the basics of urllib, including the commonly used pieces: urllib.request.build_opener(), urllib.parse.urljoin(), urllib.parse.quote(), urllib.request.urlopen(),

       urllib.request.install_opener(), http.cookiejar, urllib.request.HTTPSHandler() and urllib.request.HTTPCookieProcessor().
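
       For example, a minimal sketch of how those pieces fit together (standard library only, independent of the crawler below):

           import urllib.request
           import http.cookiejar

           # A CookieJar plus HTTPCookieProcessor lets the opener remember cookies
           # across requests; HTTPSHandler covers the HTTPS side.
           jar = http.cookiejar.CookieJar()
           opener = urllib.request.build_opener(
               urllib.request.HTTPCookieProcessor(jar),
               urllib.request.HTTPSHandler(),
           )
           # install_opener() makes the module-level urllib.request.urlopen() use it.
           urllib.request.install_opener(opener)
           response = urllib.request.urlopen("https://search.jd.com/")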

       Request pacing is randomized with random.uniform().

       For now the crawler only supports scraping the phone search pages.

       All image information is saved as links; the images themselves can be downloaded with urllib.request.urlretrieve(), as sketched below.
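
       A minimal sketch (it assumes the one-JSON-object-per-line format written by save_content below; downloading into the current directory is an assumption):

           import json
           import urllib.request

           with open("jindong_phone_info.json", encoding="utf8") as f:
               for line in f:
                   page = json.loads(line)
                   for product in page["product_list"]:
                       for pic in product["pic_info"]:
                           href = pic["pic_href"]
                           if href.startswith("https:"):
                               # name each file after the last URL segment
                               urllib.request.urlretrieve(href, href.rsplit("/", 1)[-1])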

       To build a multi-threaded crawler, see: https://www.cnblogs.com/nuochengze/p/12861358.html
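
       (The linked post wires up its own threads and queues; purely as an illustration of the idea, the standard library's concurrent.futures can also parallelize the page downloads. fetch_one and the single-URL url_list here are illustrative stand-ins, not part of the crawler:)

           from concurrent.futures import ThreadPoolExecutor
           from urllib import request

           def fetch_one(url):
               # illustrative helper: download one page and return its HTML
               with request.urlopen(url) as resp:
                   return resp.read().decode()

           url_list = ["https://search.jd.com/s_new.php?keyword=%E6%89%8B%E6%9C%BA&page=3&s=60"]
           # fetch up to 4 pages at a time; map() keeps results in input order
           with ThreadPoolExecutor(max_workers=4) as pool:
               pages = list(pool.map(fetch_one, url_list))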

    Output preview:

      (the screenshot from the original post is not reproduced here)
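
      In its place, this is the shape of one page record as written by save_content (all values are illustrative placeholders, not real scraped data):

          {
            "page_count": 100,
            "page_current": 1,
            "product_list": [
              {
                "title": "...",
                "href": "https://item.jd.com/....html",
                "price": "...",
                "pic_info": [
                  {"pic_title": "...", "pic_href": "https://..."}
                ],
                "comment_href": "https://item.jd.com/....html#comment",
                "store": {"shop_name": "...", "shop_href": "https://..."}
              }
            ]
          }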

    Source code:

    from urllib import request
    from urllib import parse
    from urllib import error
    from http import cookiejar
    import re
    from pprint import pprint
    import time
    import random
    import json
    
    
    class JdPhoneInfo(object):
        def __init__(self,key_word):
            self.key_word = key_word
    
        def get_url(self,key_word,page_num,page_count):
            url_list = list()
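            # urljoin() replaces the path and query of url_base with url_, so the
            # keyword/page/s values hard-coded in url_base are placeholders only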
            url_base = "https://search.jd.com/s_new.php?keyword=%E6%89%8B%E6%9C%BA&page=2&s=30"
            while page_num<page_count:
                info = {
                    "keyword":key_word,
                    "page":page_num+1,
                    "s":page_num*30,
                }
                url_ = "s_new.php?"+parse.urlencode(info)
                url = parse.urljoin(base=url_base,url=url_)
                url_list.append(url)
                page_num += 1
            return url_list
    
        def parse_info(self,html_str):
            """获取整页的响应信息,包括page_count,page_current"""
            page_info = dict()
            # total number of result pages
            page_count = re.compile(r'page_count:"(.*?)"',re.S).findall(html_str)
            page_info["page_count"] = int(page_count[0]) if page_count else None
            # current page number
            page_current = re.compile(r'page:"(.*?)",page_count',re.S).findall(html_str)
            page_info["page_current"] = int(page_current[0]) if page_current else None
            # collect every product on the page
            page_info["product_list"] = list()
            product_info_list = re.compile(r'class="p-img"(.*?)class="p-icons"', re.S).findall(html_str)
            ## info for a single product
            for one_product_info in product_info_list:
                info = dict()
                # title and link
                str_ = re.compile(r'p-name p-name-type-2(.*?)</div>',re.S).findall(one_product_info)[0]
                title = re.compile(r'em>(.*?)</em>',re.S).findall(str_)
                info["title"] =re.sub(r'
    |	|s|(<.*?>)','',title[0]).strip() if title else None
                href = re.compile(r'href="(.*?)"',re.S).findall(str_)
                info["href"] = "https:"+href[0] if href else None
                # price
                str_ = re.compile(r'class="p-price"(.*?)</div>',re.S).findall(one_product_info)[0]
                price = re.compile(r'i>(.*?)</i>', re.S).findall(str_)
                info["price"] = price[0] if price else None
                # images
                info["pic_info"] = list()
                img_list = re.compile(r'class="ps-item">(.*?)</li>',re.S).findall(one_product_info)
                for img in img_list:
                    pic_info_ = dict()
                    pic_title = re.compile(r'title="(.*?)">',re.S).findall(img)
                    pic_info_["pic_title"] = pic_title[0] if pic_title else None
                    pic_href = re.compile(r'data-lazy-img="(.*?)"',re.S).findall(img)
                    pic_info_["pic_href"] = "https:"+pic_href[0] if pic_href else "---"
                    info["pic_info"].append(pic_info_)
                # review link
                info["comment_href"] = info["href"]+"#comment"
                # seller store and link
                info["store"] = dict()
                str_ = re.compile(r'class="p-shop"(.*?)</div>',re.S).findall(one_product_info)[0]
                shop_name = re.compile(r'title="(.*?)"',re.S).findall(str_)
                info["store"]["shop_name"] = shop_name[0] if shop_name else None
                shop_href = re.compile(r'href="(.*?)"', re.S).findall(str_)
                info["store"]["shop_href"] = "https:"+shop_href[0] if shop_href else None
                # append this product to the product list
                page_info["product_list"].append(info)
            return page_info
    
        def get_request(self,first_url,url=None,url_index_num=None,url_list=None):
            # build the cookie_handler and https_handler handlers
            # (note: a fresh CookieJar is created per request, so cookies do not persist between pages)
            cookie_jar = cookiejar.CookieJar()
            cookie_handler = request.HTTPCookieProcessor(cookie_jar)
            https_handler = request.HTTPSHandler()
            opener = request.build_opener(cookie_handler, https_handler)
            request.install_opener(opener)
            user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
            if url_list is not None:
                request_ = request.Request(url=url)
                request_.add_header("User-Agent", user_agent)
                # set the Referer to the previous URL in the chain so requests look like normal paging
                if url_index_num == 0:
                    request_.add_header(key="referer",val=first_url)
                else:
                    request_.add_header(key="referer",val=url_list[url_index_num-1])
            else:
                # Request instance for the first page
                request_ = request.Request(url=first_url)
                # add the header
                request_.add_header("User-Agent", user_agent)
            response_ = request.urlopen(request_)
            return response_
        
        def save_content(self,info):
            with open("jindong_phone_info.json",'a+',encoding='utf8') as f:
                f.write(json.dumps(info,ensure_ascii=False,indent=2))
                print("当前写入url",info["page_current"])
    
        def run(self):
    
            first_url = "https://search.jd.com/Search?keyword={}".format(parse.quote(self.key_word))
            # get the total number of pages
            ## request the first page
            first_response_html = self.get_request(first_url=first_url).read().decode()
            ## extract the page info
            page_info = self.parse_info(first_response_html)    # page_info is a dict
            # save the content
            self.save_content(page_info)
            # build the URLs for the remaining pages
            url_list = self.get_url(self.key_word,page_num=page_info["page_current"],page_count=page_info["page_count"])
            for url_index_num, url in enumerate(url_list):
                response_html = self.get_request(first_url=first_url,url=url,url_list=url_list,url_index_num=url_index_num).read().decode()
                page_info = self.parse_info(response_html)
                # save the content
                self.save_content(page_info)
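                # sleep a random 1-2 seconds between requests to throttle the crawl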
                num = random.uniform(1,2)
                time.sleep(num)
                
        
    if __name__=="__main__":
        # key_word = input("Enter a keyword: ")
        key_word = "手机"    # the search keyword: "mobile phone"
        print("This program collects: title and link, price, images, review link, seller store and link")
        obj = JdPhoneInfo(key_word)
        obj.run()

    <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

  • Original post: https://www.cnblogs.com/nuochengze/p/13044019.html