京东手机信息爬取（全部手机）

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

仅学习参考，不可用于商业用途

version_0

说明：单线程爬虫，仅使用Python自带模块，包括urllib、json等。

　　　写这个爬虫是为了熟悉urllib的基本使用，包括常用函数：urllib.request.build_opener()、urllib.parse.urljoin()、urllib.parse.quote()、urllib.request.urlopen()、

　　　urllib.request.install_opener()、http.cookiejar、urllib.request.HTTPSHandler()、urllib.request.HTTPCookieProcessor()。

　　　请求间隔通过random.uniform()随机选取（每次请求后随机等待1~2秒）。

　　　本爬虫目前只支持获取手机页面的信息。

　　　所有的图片信息，以链接方式保存。可以使用urllib.request.urlretrieve()下载。

　　　若要构造多线程爬虫，请参考：https://www.cnblogs.com/nuochengze/p/12861358.html

效果预览：

源码如下：

from urllib import request
from urllib import parse
from urllib import error
from http import cookiejar
import re
from pprint import pprint
import time
import random
import json


class JdPhoneInfo(object):
    """Single-threaded crawler for JD.com search-result pages of one keyword.

    The crawler requests the first search page, reads the total page count
    from it, builds the URLs of the remaining pages and appends the parsed
    content of every page to a JSON text file.
    """

    # Base against which the relative "s_new.php?..." page URLs are resolved.
    _SEARCH_BASE = "https://search.jd.com/"
    _USER_AGENT = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    )
    _OUTPUT_FILE = "jindong_phone_info.json"

    # Patterns are compiled once here instead of once per product.
    _RE_PAGE_COUNT = re.compile(r'page_count:"(.*?)"', re.S)
    _RE_PAGE_CURRENT = re.compile(r'page:"(.*?)",page_count', re.S)
    _RE_PRODUCT = re.compile(r'class="p-img"(.*?)class="p-icons"', re.S)
    _RE_NAME = re.compile(r'p-name p-name-type-2(.*?)</div>', re.S)
    _RE_TITLE = re.compile(r'em>(.*?)</em>', re.S)
    # Removes every whitespace character and every inline tag from a title.
    _RE_TITLE_NOISE = re.compile(r'\s|<.*?>')
    _RE_HREF = re.compile(r'href="(.*?)"', re.S)
    _RE_PRICE_BOX = re.compile(r'class="p-price"(.*?)</div>', re.S)
    _RE_PRICE = re.compile(r'i>(.*?)</i>', re.S)
    _RE_IMG_ITEM = re.compile(r'class="ps-item">(.*?)</li>', re.S)
    _RE_IMG_TITLE = re.compile(r'title="(.*?)">', re.S)
    _RE_IMG_SRC = re.compile(r'data-lazy-img="(.*?)"', re.S)
    _RE_SHOP_BOX = re.compile(r'class="p-shop"(.*?)</div>', re.S)
    _RE_SHOP_NAME = re.compile(r'title="(.*?)"', re.S)

    def __init__(self, key_word):
        """Store the search keyword and prepare the lazily built opener."""
        self.key_word = key_word
        self._opener = None

    @staticmethod
    def _first(pattern, text):
        """Return the first captured group of *pattern* in *text*, or None."""
        match = pattern.search(text)
        return match.group(1) if match else None

    @staticmethod
    def _with_scheme(link):
        """Prefix a link with "https:"; pass None through."""
        return "https:" + link if link is not None else None

    def get_url(self, key_word, page_num, page_count):
        """Build the URLs of the pages that follow page *page_num*.

        Args:
            key_word: search keyword placed in the query string.
            page_num: number of the page that was already fetched.
            page_count: total number of pages.

        Returns:
            A list of absolute URLs, one per value in
            ``range(page_num, page_count)``. The list is empty when either
            number is None (i.e. it could not be parsed from the first page).
        """
        if page_num is None or page_count is None:
            return []
        url_list = []
        for page in range(page_num, page_count):
            query = parse.urlencode({
                "keyword": key_word,
                "page": page + 1,
                "s": page * 30,
            })
            url_list.append(parse.urljoin(self._SEARCH_BASE, "s_new.php?" + query))
        return url_list

    def _parse_product(self, product_html):
        """Extract title, links, price, pictures and shop of one product."""
        info = dict()

        # Title and product link; a missing section yields None values
        # instead of raising IndexError.
        name_html = self._first(self._RE_NAME, product_html) or ""
        title = self._first(self._RE_TITLE, name_html)
        info["title"] = (
            self._RE_TITLE_NOISE.sub("", title).strip() if title is not None else None
        )
        info["href"] = self._with_scheme(self._first(self._RE_HREF, name_html))

        # Price
        price_html = self._first(self._RE_PRICE_BOX, product_html) or ""
        info["price"] = self._first(self._RE_PRICE, price_html)

        # Pictures (stored as links only)
        info["pic_info"] = list()
        for img_html in self._RE_IMG_ITEM.findall(product_html):
            pic_href = self._first(self._RE_IMG_SRC, img_html)
            info["pic_info"].append({
                "pic_title": self._first(self._RE_IMG_TITLE, img_html),
                "pic_href": "https:" + pic_href if pic_href is not None else "---",
            })

        # Comment link; only meaningful when the product link exists.
        info["comment_href"] = (
            info["href"] + "#comment" if info["href"] is not None else None
        )

        # Shop name and link
        shop_html = self._first(self._RE_SHOP_BOX, product_html) or ""
        info["store"] = {
            "shop_name": self._first(self._RE_SHOP_NAME, shop_html),
            "shop_href": self._with_scheme(self._first(self._RE_HREF, shop_html)),
        }
        return info

    def parse_info(self, html_str):
        """Parse one response page.

        Args:
            html_str: decoded response text.

        Returns:
            A dict with the keys ``page_count`` and ``page_current`` (int, or
            None when not present in the text) and ``product_list`` (a list of
            per-product dicts with the keys ``title``, ``href``, ``price``,
            ``pic_info``, ``comment_href`` and ``store``).
        """
        page_count = self._first(self._RE_PAGE_COUNT, html_str)
        page_current = self._first(self._RE_PAGE_CURRENT, html_str)
        page_info = dict()
        page_info["page_count"] = int(page_count) if page_count else None
        # Each value is guarded by its own match.
        page_info["page_current"] = int(page_current) if page_current else None
        page_info["product_list"] = [
            self._parse_product(product_html)
            for product_html in self._RE_PRODUCT.findall(html_str)
        ]
        return page_info

    def _get_opener(self):
        """Return the shared opener, building and installing it on first use.

        One opener (and so one cookie jar) is reused for every request so
        cookies received on one page are sent with the following requests.
        """
        opener = getattr(self, "_opener", None)
        if opener is None:
            cookie_handler = request.HTTPCookieProcessor(cookiejar.CookieJar())
            opener = request.build_opener(cookie_handler, request.HTTPSHandler())
            request.install_opener(opener)
            self._opener = opener
        return opener

    def get_request(self, first_url, url=None, url_index_num=None, url_list=None):
        """Send one GET request and return the open response object.

        Args:
            first_url: URL of the first search page. It is requested directly
                when *url_list* is None, otherwise it is used as the referer
                of the first entry of *url_list*.
            url: URL to request when *url_list* is given.
            url_index_num: index of *url* inside *url_list*.
            url_list: all follow-up URLs; the entry before *url* is sent as
                the referer.

        Returns:
            The response object; the caller is responsible for closing it.
        """
        opener = self._get_opener()
        if url_list is not None:
            request_ = request.Request(url=url)
            request_.add_header("User-Agent", self._USER_AGENT)
            # The first follow-up page (or an unknown index) refers back to
            # the first search page, later ones to their predecessor.
            if not url_index_num:
                referer = first_url
            else:
                referer = url_list[url_index_num - 1]
            request_.add_header(key="referer", val=referer)
        else:
            request_ = request.Request(url=first_url)
            request_.add_header("User-Agent", self._USER_AGENT)
        return opener.open(request_)

    def save_content(self, info):
        """Append *info* as one JSON document to the output file.

        A newline is written after every document so consecutive documents
        do not run into each other.
        """
        with open(self._OUTPUT_FILE, "a", encoding="utf8") as f:
            f.write(json.dumps(info, ensure_ascii=False, indent=2))
            f.write("\n")
        print("当前写入url", info["page_current"])

    def _fetch_page(self, **request_kwargs):
        """Request one page, close the response and return the parsed dict."""
        with self.get_request(**request_kwargs) as response:
            html = response.read().decode()
        return self.parse_info(html)

    def run(self):
        """Crawl the first page and every following page, saving each one."""
        first_url = "https://search.jd.com/Search?keyword={}".format(
            parse.quote(self.key_word)
        )
        # The first page supplies the current page number and the page count.
        page_info = self._fetch_page(first_url=first_url)
        self.save_content(page_info)

        url_list = self.get_url(
            self.key_word,
            page_num=page_info["page_current"],
            page_count=page_info["page_count"],
        )
        for index, url in enumerate(url_list):
            page_info = self._fetch_page(
                first_url=first_url,
                url=url,
                url_list=url_list,
                url_index_num=index,
            )
            self.save_content(page_info)
            # Random pause between requests.
            time.sleep(random.uniform(1, 2))
            
    
if __name__ == "__main__":
    # Script entry point: announce what will be collected, then crawl the
    # search results for a fixed keyword (replace the literal with an
    # input(...) call to prompt for the keyword interactively).
    print("本程序将采集以下信息:标题及连接，价格，图片，评价连接，售卖店铺及链接")
    key_word = "手机"
    JdPhoneInfo(key_word).run()

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

相关阅读:
CentOS7安装mysql-8
zabbix监控规划及实施
 集群技术
 自动化脚本-配置LVS（DR模式）
Pacemaker+ISCSI实现Apache高可用-配置
 创建集群corosync
我的第一个python程序——猜数字
 质量报告
 新需求测试与回归测试
 冒烟测试
原文地址：https://www.cnblogs.com/nuochengze/p/13044019.html

最新文章
Day8
实现mongodb通讯
 Day8
应该注意的点
 安装Robomongo
路由修改，集中在index.js
大战之前的感想
 HDU 6203
hdu 6194
HDU 6198

热门文章
HDU 6040
HDU 6044
HDU 6035
HDU 1495
HDU 4763
CF 862C
MySQL5.5升级至5.7
Docker常用命令及脚本
 redis主从切换
 MySQL5.7的参数优化