抓取天猫手机评论

import re
import json
import time
import requests
from bs4 import BeautifulSoup 
 
 
# HTTP headers sent with every request made by this script. The User-Agent
# and Accept values are those of a desktop Chrome browser.
tm_headers = {
    "scheme": "https",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    # "br" is intentionally not advertised: requests can only decode Brotli
    # bodies when an optional Brotli package is installed, and without it
    # response.text would be undecoded compressed data.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Content-Type": "text/html",
}
 
def req(url, headers):
    """Fetch *url* with a GET request and parse the body as HTML.

    Args:
        url: Address to request.
        headers: Mapping of HTTP headers passed to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` tree built from the response text when the
        server answers with status 200. ``None`` for any other status, or
        when the request or the parsing raises.
    """
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code == 200:
            return BeautifulSoup(response.text, "html.parser")
        # Non-200 answers used to be dropped silently; report them so a
        # failed crawl can be diagnosed.
        print("get url error, url: {0}, status: {1}".format(url, response.status_code))
    except Exception as err:
        # Deliberately broad: this helper signals every failure by returning
        # None instead of raising, so the error is reported here.
        print("get url error, url: {0}, error: {1}".format(url, err))
    return None

def get_phone_list():
    """Collect the phones listed on the Tmall phone channel page.

    Returns:
        A list of ``{"url": ..., "name": ...}`` dicts, one per list entry
        that has both a link and a non-empty name. The list is empty when
        the channel page could not be fetched.
    """
    list_url = "https://shouji.tmall.com/?spm=a222t.8063993.a2226c3nav.5.7b8f4da0yjyxC3&acm=lb-zebra-155904-807029.1003.4.767290&scm=1003.4.lb-zebra-155904-807029.OTHER_14592967254716_767290#J_floor12"
    soup = req(list_url, tm_headers)
    if soup is None:
        # req() returns None on any fetch failure; there is nothing to list.
        return []

    phone_list = []
    entries = soup.find_all("li", class_="focus-")
    # The trailing five matches are dropped; presumably they are not phone
    # entries -- TODO confirm against the live page.
    for entry in entries[:-5]:
        link = entry.find("a")
        title = entry.find("h3")
        if link is None or title is None:
            continue
        href = link.get("href")
        name = title.get_text()
        if not href or name == "":
            continue
        # Links are expected to be protocol-relative ("//host/path"), hence
        # the "https:" prefix; an already absolute link is kept untouched.
        if href.startswith(("http://", "https://")):
            url = href
        else:
            url = "https:" + href
        phone_list.append({"url": url, "name": name})
    return phone_list

def _extract_item_id(url):
    """Return the value of the ``id`` query parameter of *url*."""
    match = re.search(r"[?&]id=([^&#]*)", url)
    if match:
        return match.group(1)
    # No standalone ``id`` parameter: fall back to the legacy substring split.
    return url.split("id=")[-1].split("&")[0]


def _fetch_seller_id(url):
    """Read the seller id from the meta tags of the detail page at *url*."""
    soup = req(url, tm_headers)
    if soup is None:
        raise ValueError("cannot fetch detail page: {0}".format(url))
    # Search from the last meta tag backwards: the seller id is expected in
    # the content of the final one, in the form "...userid=<id>;".
    for meta in reversed(soup.find_all("meta")):
        content = meta.get("content")
        if content and "userid=" in content:
            return content.split("userid=")[-1].replace(";", "")
    raise ValueError("no seller id found in detail page: {0}".format(url))


def create_deltail_url(url, page=1, itemid=None, sellerid=None):
    """Build the comment-API address for one page of one item.

    The comment API needs both an item id and a seller id. The item id is
    taken from the ``id`` query parameter of *url*; the seller id can only
    be obtained from the item's detail page.

    Args:
        url: Detail-page address of the item.
        page: Comment page number to request.
        itemid: Item id, if already known; otherwise parsed from *url*.
        sellerid: Seller id, if already known; otherwise the detail page
            is downloaded to look it up.

    Returns:
        A ``(comment_json_url, itemid, sellerid)`` tuple.

    Raises:
        ValueError: If the seller id has to be looked up and the detail
            page cannot be fetched or does not contain it.
    """
    # Resolve each id independently so that supplying only one of them never
    # leaves the other as None in the generated address.
    if itemid is None:
        itemid = _extract_item_id(url)
    if sellerid is None:
        sellerid = _fetch_seller_id(url)

    comment_json_url = "https://rate.tmall.com/list_detail_rate.htm?itemId={0}&sellerId={1}&currentPage={2}".format(itemid, sellerid, page)
    return comment_json_url, itemid, sellerid


def _parse_comment_payload(text):
    """Return the comment object embedded in the raw response *text*.

    The JSON object is located by its first ``{`` rather than by a fixed
    offset, so the length of whatever precedes it does not matter. Returns
    ``None`` when no JSON object can be decoded.
    """
    start = text.find("{")
    if start == -1:
        return None
    try:
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except ValueError:
        return None
    # When the whole response is one object, the comment data sits under its
    # "rateDetail" member; otherwise the decoded object already is that data.
    detail = parsed.get("rateDetail")
    return detail if isinstance(detail, dict) else parsed


def get_deltail(db, comment_json_url, itemid, sellerid, name):
    """Call the comment API for one page and return the last page number.

    Args:
        db: Storage handle. Not used yet: persisting the comment records
            is not implemented in this function.
        comment_json_url: Address of the comment page to download.
        itemid: Item id copied into every comment record.
        sellerid: Seller id copied into every comment record.
        name: Phone name; currently unused.

    Returns:
        The ``lastPage`` value of the response's ``paginator`` object, or
        ``None`` when the page could not be fetched or parsed or carries
        no paginator.
    """
    comment_data = req(comment_json_url, tm_headers)
    if comment_data is None:
        return None

    # Retry while the response lacks the "paginator" marker, for at most
    # five requests in total, pausing one second before each retry.
    attempts = 1
    while "paginator" not in str(comment_data) and attempts < 5:
        time.sleep(1)
        comment_data = req(comment_json_url, tm_headers)
        attempts += 1

    # NOTE(review): the body has been through the HTML parser in req(),
    # which may alter markup-like characters inside comment texts.
    payload = _parse_comment_payload(str(comment_data))
    if payload is None:
        return None

    for item in payload.get("rateList") or []:
        data = {
            "itemid": itemid,
            "usernick": item.get("displayUserNick"),
            "comment_content": item.get("rateContent"),
            "comment_date": item.get("rateDate"),
            "sellerid": sellerid,
        }
        # TODO: insert `data` into `db`; the record is built but not stored.

    paginator = payload.get("paginator")
    if not isinstance(paginator, dict):
        return None
    return paginator.get("lastPage")



if __name__ == "__main__":
    # No storage backend is wired up in this script. get_deltail() accepts a
    # handle but does not use it, so a placeholder avoids the NameError that
    # the previously undefined `db` caused.
    db = None

    for phone in get_phone_list():
        name = phone["name"]
        url = phone["url"]
        print("开始抓取: {0}  手机, 页码: {1}".format(name, 1))
        try:
            comment_json_url, itemid, sellerid = create_deltail_url(url)
        except Exception as err:
            # Top-level boundary: a phone whose ids cannot be resolved is
            # reported and skipped instead of aborting the whole crawl.
            print("抓取失败, 跳过: {0} 手机, 原因: {1}".format(name, err))
            continue

        pagenum = get_deltail(db, comment_json_url, itemid, sellerid, name)
        if pagenum is None:
            continue

        # Page 1 was fetched above. The upper bound is inclusive so that the
        # last page is crawled as well.
        for page in range(2, pagenum + 1):
            print("开始抓取: {0} 手机, 页码: {1}".format(name, page))
            comment_json_url, itemid, sellerid = create_deltail_url(url, page, itemid, sellerid)
            get_deltail(db, comment_json_url, itemid, sellerid, name)
            # Pause between pages to keep the request rate low.
            time.sleep(2)

本教程仅供技术研究与学习使用；若有侵权，请联系本人删除。

相关阅读:
【题解】【BT】【Leetcode】Populating Next Right Pointers in Each Node
【题解】【BT】【Leetcode】Binary Tree Level Order Traversal
【题解】【BST】【Leetcode】Unique Binary Search Trees
【题解】【矩阵】【回溯】【Leetcode】Rotate Image
【题解】【排列组合】【素数】【Leetcode】Unique Paths
【题解】【矩阵】【回溯】【Leetcode】Unique Paths II
【题解】【BST】【Leetcode】Validate Binary Search Tree
【题解】【BST】【Leetcode】Convert Sorted Array to Binary Search Tree
第 10 章判断用户是否登录
 第 8 章动态管理资源结合自定义登录页面
原文地址：https://www.cnblogs.com/dockers/p/7767914.html