• Python Learning Diary: A Small Exercise in Scraping SoYoung (新氧) Data


    # Scrape SoYoung (新氧) data
    import requests
    import json
    import xlwt
    from bs4 import BeautifulSoup
    
    # HTTP/HTTPS proxies; swap these out whenever SoYoung blocks the current IP
    proxies = {"http": "http://49.70.64.155:9999", "https": "http://59.57.148.70:9999"}
    # Row index of the output sheet, shared by the functions below
    row = 0
    def get_shuhouhuli(url_diclist):
        # Standalone helper (not called by scra_data below): scrape the
        # post-op care ("术后护理") section for each {name: detail-url} dict.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
        }
        global row
        # Create the workbook once, not once per dict, so earlier rows are not lost
        workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = workbook.add_sheet('doctorinfo', cell_overwrite_ok=True)
        for url_dic in url_diclist:
            for k, v in url_dic.items():
                response = requests.get(v, headers=headers)
                soup = BeautifulSoup(response.text, 'lxml')
                shuhouhulilist = soup.select("#surgery_after > div > div")
                cols = 0
                for shuhouhuli in shuhouhulilist:
                    print(shuhouhuli.text)
                    sheet.write(row, cols, shuhouhuli.text)
                    cols = cols + 1
                row = row + 1
        workbook.save("xinyanginfo.xls")
    
    def get_finalurl(preurl):
        # From the category's item-list JSON, build a list of {item name: detail-page URL} dicts
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
        }
        finalurl = []
        try:
            response = json.loads(requests.get(preurl, headers=headers).text)
            for info in response:
                try:
                    # Detail pages live at /itemk/<pinyin slug>/
                    pinyin = info["seo"]["pinyin"]
                    finalurl.append({info["name"]: "https://www.soyoung.com/itemk/" + pinyin + "/"})
                except (KeyError, TypeError):
                    print(info)
        except Exception:
            print(preurl + " is unavailable")
        return finalurl
    
    def scra_data():
        global row
        workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = workbook.add_sheet('xinyanginfo', cell_overwrite_ok=True)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        url = ""
        try:
            for i in range(20155, 20244):
                # First-level URL: the item-list JSON endpoint for one menu category
                url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=" + str(i)
                # From the first-level URL, get a list of {name: url} dicts (the second-level URLs)
                finalurldic = get_finalurl(url)
                # Scrape each second-level URL for its post-op care info
                for url_dic in finalurldic:
                    for k, v in url_dic.items():
                        response = requests.get(v, headers=headers)
                        soup = BeautifulSoup(response.text, 'lxml')
                        shuhouhulilist = soup.select("#surgery_after > div > div")
                        cols = 2
                        sheet.write(row, 0, k)
                        sheet.write(row, 1, v)
                        for shuhouhuli in shuhouhulilist:
                            sheet.write(row, cols, shuhouhuli.text)
                            cols = cols + 1
                        row = row + 1
        except Exception:
            # Save whatever has been scraped so far and report where it stopped
            workbook.save("xinyanginfo.xls")
            print(url)
        workbook.save("xinyanginfo.xls")
    
    scra_data()
    

  Recording the scraping code here. Because of SoYoung's anti-scraping measures, the proxy has to be swapped out frequently; roughly four runs should be enough to collect the full data set.
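
  Since the `proxies` dict in the script is defined but never actually passed to `requests.get`, below is a minimal sketch of how the proxy rotation described above could be wired in. The `PROXY_POOL` name and the `fetch` helper are illustrative choices of my own, not part of the original script; only the `proxies=` and `timeout=` arguments are standard `requests` parameters.

    # Hypothetical sketch: rotate through a pool of proxies and retry when one is blocked.
    # The pool entries are placeholders copied from the script; replace them with working proxies.
    import itertools
    import requests
    
    PROXY_POOL = itertools.cycle([
        {"http": "http://49.70.64.155:9999", "https": "http://59.57.148.70:9999"},
        # ... add more {scheme: proxy-url} dicts here ...
    ])
    
    def fetch(url, headers, retries=3):
        # Try the URL with up to `retries` different proxies from the pool
        for _ in range(retries):
            proxy = next(PROXY_POOL)
            try:
                resp = requests.get(url, headers=headers, proxies=proxy, timeout=10)
                if resp.status_code == 200:
                    return resp
            except requests.RequestException:
                continue  # this proxy is dead or blocked, try the next one
        return None

  Each `requests.get(v, headers=headers)` call in the script could then be replaced by `fetch(v, headers)`, which keeps the per-run proxy churn in one place.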
