• 网络爬虫爬取淘宝商品页面数据


    import json
    import re

    import bs4
    import requests
    from bs4 import BeautifulSoup
    
    '''
    书包起始页https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306
    书包第二页https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44
    书包第三页https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=0&ntoffset=6&p4ppushleft=1%2C48&s=88
    '''
    def get_html_text(url: str) -> str:
        """Download ``url`` and return the response body as text.

        The response encoding is overridden with ``apparent_encoding`` before
        the body is decoded, so pages that declare a wrong or missing charset
        are still decoded sensibly.

        Args:
            url: Address of the page to fetch.

        Returns:
            The decoded page text. If the request fails (connection error,
            timeout after 30 seconds, or a non-success HTTP status) the
            message '该网页请求连接失败' is returned instead of raising.
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only network/HTTP failures are turned into the sentinel message;
            # KeyboardInterrupt and programming errors are left to propagate.
            return '该网页请求连接失败'
    
    # The item data is embedded in script code rather than in complete HTML
    # markup, so searching the raw text with regular expressions is simpler
    # than parsing a document tree.
    # Raw strings keep the backslashes intact for the regex engine.
    _JSON_STRING = r'("(?:[^"\\]|\\.)*")'
    _PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
    _TITLE_RE = re.compile(r'"raw_title":' + _JSON_STRING)
    _LOCATION_RE = re.compile(r'"item_loc":' + _JSON_STRING)


    def _decode_json_string(literal):
        """Decode one double-quoted JSON string literal without using eval()."""
        try:
            # strict=False tolerates raw control characters inside the literal.
            return json.loads(literal, strict=False)
        except ValueError:
            # Malformed escape sequence: fall back to the text between the quotes.
            return literal[1:-1]


    def parse_page(list_info, html):
        """Extract goods records from a search-result page and collect them.

        The page text is scanned for ``"view_price"``, ``"raw_title"`` and
        ``"item_loc"`` entries. Matches are paired up in order of appearance;
        if the three lists differ in length the surplus entries are dropped.

        Args:
            list_info: List that receives one ``[price, location, title]`` entry
                per item. ``price`` is kept as the string found in the page.
            html: Page text to search.

        Returns:
            None. ``list_info`` is extended in place. If ``html`` is not text,
            '解析网页出现异常' is printed and nothing is appended.
        """
        try:
            prices = _PRICE_RE.findall(html)
            titles = _TITLE_RE.findall(html)
            locations = _LOCATION_RE.findall(html)
        except TypeError:
            print('解析网页出现异常')
            return

        # zip() stops at the shortest list, so a missing field can no longer
        # raise IndexError half-way through a page.
        for price, title, location in zip(prices, titles, locations):
            list_info.append([
                price,
                _decode_json_string(location),
                _decode_json_string(title),
            ])
    
    def print_goods_info(list_info):
        """Print a header line followed by one numbered row per goods entry.

        Args:
            list_info: Sequence of entries whose first three elements are the
                price, the shipping location and the title, in that order.
                The payment-count column is currently not printed.
        """
        row_format = '{:4}	{:8}	{:12}	{:20}	'
        print(row_format.format('序号', '商品价格', '发货地址', '商品名称'))
        # Row numbers start at 1.
        for index, item in enumerate(list_info, start=1):
            print(row_format.format(index, item[0], item[1], item[2]))
    
    if __name__ == '__main__':
        # Crawl the first `depth` result pages for the keyword and print
        # everything that was collected as one table.
        goods = '书包'
        depth = 2
        start_url = 'https://s.taobao.com/search?q=' + goods
        list_info = []
        for i in range(depth):
            try:
                # The `s` query parameter advances by 44 per result page
                # (0, 44, 88, ...).
                url = start_url + '&s=' + str(44 * i)
                html = get_html_text(url)
                parse_page(list_info, html)
            except Exception:
                # A problem on one page is skipped so it does not affect the
                # rest of the run; KeyboardInterrupt/SystemExit are not caught,
                # so the crawl can still be stopped by the user.
                continue
        print_goods_info(list_info)
        
  • 相关阅读:
    action里设置session
    用Eclipse导出能直接运行的jar包
    linux下命令行运行jar文件出错
    扫描对方主机端口
    调用Axis Webservice异常: java.net.ConnectException: Connection timed out: connect
    PL/SQL 安装出现的一些问题
    调用Axis WebService异常:org.xml.sax.SAXException: Bad envelope tag: definitions
    开通博客第一天
    正则表达式
    匿名函数
  • 原文地址:https://www.cnblogs.com/liberate20/p/10799769.html
Copyright © 2020-2023  润新知