• 淘宝爬虫 (Taobao web crawler)


    # HTTP client used for all page fetches below.
    import  requests
    # Regex extraction of fields from the JSON blob embedded in Taobao search pages.
    import re
    # NOTE(review): the three urllib imports and `time` appear unused in this chunk —
    # candidates for removal once the rest of the file is confirmed not to need them.
    from urllib import request
    import urllib.request
    import pymysql
    import time
    # Module-level MySQL connection shared by get_url().
    # NOTE(review): credentials are hardcoded — move to config/env before deploying.
    conn=pymysql.connect(host="127.0.0.1",user="root",passwd="123456",db="world")
    def get_url():
        """Scrape Taobao search result pages (offsets s=220..440) and store items in MySQL.

        For each result page, extracts title, picture URL, item id, sales text,
        shop nick and price from the JSON blob embedded in the HTML, then fetches
        a per-item detail endpoint to read the real sold count, and inserts one
        row per item into table ``taobaopc1``.

        Side effects: HTTP requests to taobao.com; INSERTs + commits on the
        module-level connection ``conn``. Returns None.
        """
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36')
        search_headers = {'User-Agent': user_agent}
        # Field extractors, compiled once (the original rebuilt them every page).
        # Bug fix: the nick/price patterns used '.' where ':' was intended.
        title_re = re.compile(r'"raw_title":"(.*?)","pic_url"')
        pic_re = re.compile(r'"pic_url":"//(.*?)"')
        item_id_re = re.compile(r'"nid":"(.*?)","category"')
        sales_re = re.compile(r'"view_sales":"(.*?)"')
        shop_re = re.compile(r'"nick":"(.*?)"')
        price_re = re.compile(r'"view_price":"(.*?)","view_fee"')
        # Sold count hidden in the detail endpoint's JSONP payload.
        sold_re = re.compile(r'"sellCountDO":{"sellCount":"(.*?)","success":true}')
        for i in range(5, 11):
            # Search entry point; s=i*44 is the result offset (44 items per page).
            url = ('https://s.taobao.com/search?spm=a21bo.2017.201856-fline.2.'
                   '1b3311d9sSXobt&q=%E5%9B%9B%E4%BB%B6%E5%A5%97&s=' + str(i * 44))
            page = requests.get(url, headers=search_headers).text
            titles = title_re.findall(page)
            pics = pic_re.findall(page)
            item_ids = item_id_re.findall(page)
            print(item_ids)
            sales = sales_re.findall(page)
            shops = shop_re.findall(page)
            prices = price_re.findall(page)
            for j in range(len(titles)):
                item_id = item_ids[j]
                detail_headers = {
                    # The Referer header is required or the endpoint rejects the call.
                    'Referer': ('https://item.taobao.com/item.htm?spm=a1z10.5-c-s.'
                                'w4002-18518582505.20.6d887041nVz3D2&id=' + item_id),
                    'User-Agent': user_agent,
                }
                # This endpoint carries the real sold-quantity data.
                detail = requests.get(
                    'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId='
                    + item_id
                    + '&sellerId=102291787&modules=dynStock,qrcode,viewer,price,duty,'
                      'xmpPromotion,delivery,activity,fqg,zjys,couponActivity,soldQuantity,'
                      'originalPrice,tradeContract&callback=onSibRequestSuccess',
                    headers=detail_headers).text
                sold_matches = sold_re.findall(detail)
                # Bug fix: the original interpolated an undefined name
                # `soldTotalCount1` (NameError). Use the first match, or '' when
                # the endpoint returned nothing.
                sold_count = sold_matches[0] if sold_matches else ''
                # Progress log (the original's label text was lost in scraping).
                print('page ' + str(i + 1) + ', item ' + str(j + 1))
                print(titles[j])
                print(item_id)
                print(sold_matches)
                # Bug fix: parameterized INSERT instead of string concatenation —
                # scraped titles routinely contain quotes, which both broke the
                # statement and opened it to SQL injection.
                sql = ('insert into taobaopc1(bt,tp,spid,xl,dm,jg)'
                       'values(%s,%s,%s,%s,%s,%s)')
                params = (titles[j], 'https://' + pics[j], item_id,
                          sold_count, shops[j], prices[j])
                print(sql)
                with conn.cursor() as cursor:
                    cursor.execute(sql, params)
                conn.commit()
  • 相关阅读:
    你的背包,被我找到了(01背包问题)
    一点微小的改动,让你从B树理解到B+树
    有哪些令人拍案叫绝的算法?
    来来来,今天教你们用 CV 算法整点好玩的...
    内存都没了,还能运行程序?
    我的常用软件大公开!
    漫画:美团面试题(整数拆分)
    科普:我就想写个爬虫,到底要学多少东西啊?
    腾讯和阿里在 B 站评论区相遇了!
    程序员注意:这个群可以学英语,还全程免费!
  • 原文地址:https://www.cnblogs.com/snackpython/p/10329298.html
Copyright © 2020-2023  润新知