• 抓取天猫手机评论


    import re
    import json
    import time
    import requests
    from bs4 import BeautifulSoup 
     
     
    # Browser-mimicking request headers sent with every Tmall request, copied
    # from browser devtools so the crawler looks like a normal Chrome session.
    tm_headers = { 
                # NOTE(review): "scheme" is not a real HTTP header — it appears to be
                # the ":scheme" HTTP/2 pseudo-header copied from devtools; servers
                # ignore it. Harmless, but confirm before removing.
                "scheme": "https",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Cache-Control" : "max-age=0",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                # NOTE(review): Content-Type on a GET request is unusual — presumably
                # another devtools copy artifact; verify it is actually needed.
                "Content-Type": "text/html"
                   
            }   
     
    def req(url, headers):
        """Fetch ``url`` and return the parsed page as a BeautifulSoup document.

        Returns ``None`` when the request raises a network-level error, times
        out, or the server answers with a non-200 status code; the failure is
        logged to stdout in each case.
        """
        soup = None
        try:
            # Short timeout keeps the crawl moving past slow/unreachable hosts.
            content = requests.get(url, headers=headers, timeout=2)
            if content.status_code == 200:
                soup = BeautifulSoup(content.text, "html.parser")
            else:
                # Previously non-200 responses were silently dropped; log them.
                print("get url error, url: {0}, status: {1}".format(url, content.status_code))
        except requests.RequestException as e:
            # Narrowed from bare Exception; report the cause instead of hiding it.
            print("get url error, url: {0}, error: {1}".format(url, e))
        return soup
    
    def get_phone_list():
        """Scrape the Tmall phone landing page.

        Returns a list of ``{"url": ..., "name": ...}`` dicts, one per phone
        tile found on the page; empty list when the page cannot be fetched.
        """
        phone_list = []
        list_url = "https://shouji.tmall.com/?spm=a222t.8063993.a2226c3nav.5.7b8f4da0yjyxC3&acm=lb-zebra-155904-807029.1003.4.767290&scm=1003.4.lb-zebra-155904-807029.OTHER_14592967254716_767290#J_floor12"
        soup = req(list_url, tm_headers)
        if soup is None:
            # Landing page unreachable — previously this crashed with
            # AttributeError on soup.find_all; now we just return nothing.
            return phone_list
        tiles = soup.find_all("li", class_="focus-")
        # The trailing five <li> entries are non-product tiles; skip them.
        for tile in tiles[:-5]:
            anchor = tile.find("a")
            name = tile.find("h3").get_text()
            href = anchor.get("href")
            if name != "":
                # hrefs are protocol-relative ("//detail.tmall.com/..."), so
                # prepend the scheme. (Dead itemid parsing removed — the id is
                # re-derived later by create_deltail_url.)
                phone_list.append({"url": "https:" + href, "name": name})
        return phone_list
    
    def create_deltail_url(url, page=1, itemid=None, sellerid=None):
        """Build the comment-API URL for one product page.

        The rate API needs two ids: ``itemid`` (parsed from the product URL)
        and ``sellerid`` (only available inside the detail page's last <meta>
        tag, so fetching the page is required on the first call). Pass both
        ids on subsequent calls to skip the extra page fetch.

        Returns ``(comment_json_url, itemid, sellerid)``.
        """
        if itemid is None and sellerid is None:
            itemid = url.split("id=")[-1].split("&")[0]
            soup = req(url, tm_headers)
            # Last <meta> tag's content holds "...userid=NNN;" for the seller.
            txt = soup.find_all("meta")[-1].get("content")
            sellerid = txt.split("userid=")[-1].replace(";", "")
        comment_json_url = "https://rate.tmall.com/list_detail_rate.htm?itemId={0}&sellerId={1}&currentPage={2}".format(itemid, sellerid, page)
        return comment_json_url, itemid, sellerid


    def get_deltail(db, comment_json_url, itemid, sellerid, name):
        """Call the comment API and collect one page of review records.

        Retries up to 4 extra times (1s apart) until the response contains
        "paginator". Returns the last page number from the paginator, or
        ``None`` when the response never arrives or cannot be parsed.
        """
        pagenum = None
        comment_data = req(comment_json_url, tm_headers)
        if comment_data is not None:
            count = 1
            while "paginator" not in str(comment_data) and count < 5:
                comment_data = req(comment_json_url, tm_headers)
                count += 1
                time.sleep(1)
            try:
                # The API wraps its JSON in a JS callback; strip the first
                # 15 characters to reach the JSON payload.
                comment_str = str(comment_data)[15:]
                comment_json = json.loads(comment_str)
            except Exception as e:
                # Malformed/incomplete payload after retries — give up on page.
                return None
            rateList = comment_json["rateList"]
            for item in rateList:
                data = {}
                data["itemid"] = itemid
                data["usernick"] = item["displayUserNick"]
                data["comment_content"] = item["rateContent"]
                data["comment_date"] = item["rateDate"]
                data["sellerid"] = sellerid
                # insert db
            pagenum = comment_json["paginator"]["lastPage"]
        return pagenum


    if __name__ == "__main__":
        # NOTE(review): `db` is never defined in this snippet — running the
        # script as-is raises NameError; the database handle presumably came
        # from code the blog post omitted. TODO: define/inject db.
        phone_list = get_phone_list()
        for phone_url in phone_list:
            name = phone_url["name"]
            url = phone_url["url"]
            print("开始抓取: {0} 手机, 页码: {1}".format(name, 1))
            # First call fetches the detail page to discover both ids.
            comment_json_url, itemid, sellerid = create_deltail_url(url)
            pagenum = get_deltail(db, comment_json_url, itemid, sellerid, name)
            if pagenum is not None:
                page = 2
                while page < pagenum:
                    print("开始抓取: {0} 手机, 页码: {1}".format(name, page))
                    # Reuse the ids so later pages skip the detail-page fetch.
                    comment_json_url, itemid, sellerid = create_deltail_url(phone_url["url"], page, itemid, sellerid)
                    get_deltail(db, comment_json_url, itemid, sellerid, name)
                    page += 1
                    # Be polite to the API between pages.
                    time.sleep(2)

      教程仅供技术研究学习使用,若有侵权,联系本人删除

  • 相关阅读:
    【题解】【BT】【Leetcode】Populating Next Right Pointers in Each Node
    【题解】【BT】【Leetcode】Binary Tree Level Order Traversal
    【题解】【BST】【Leetcode】Unique Binary Search Trees
    【题解】【矩阵】【回溯】【Leetcode】Rotate Image
    【题解】【排列组合】【素数】【Leetcode】Unique Paths
    【题解】【矩阵】【回溯】【Leetcode】Unique Paths II
    【题解】【BST】【Leetcode】Validate Binary Search Tree
    【题解】【BST】【Leetcode】Convert Sorted Array to Binary Search Tree
    第 10 章 判断用户是否登录
    第 8 章 动态管理资源结合自定义登录页面
  • 原文地址:https://www.cnblogs.com/dockers/p/7767914.html
Copyright © 2020-2023  润新知