爬取新浪财经个股的历史财报摘要

网页的内容为:

想要的内容为:

pd.DataFrame数据结构
方便查看数据和绘图


# Usage example. Presumably `df` is the DataFrame returned by
# get_gg_fin_abs() and `plt` is matplotlib.pyplot -- neither is defined in
# this snippet, so confirm against the calling session.
# The single-argument print(...) form behaves the same on Python 2 and 3.
print(df.tail())
print(df.columns)
print(df.index.name)

fig, (ax1, ax2) = plt.subplots(2, 1)
# fig.set_figheight(fig.get_figheight()*2)
# `.ix` was removed from pandas; `.iloc` is the positional equivalent.
# Top panel: columns 0 and 3. Bottom panel: columns 5 and 6.
df.iloc[:, [0, 3]].plot(ax=ax1)
df.iloc[:, 5:7].plot(ax=ax2)
ax1.set_ylabel(u'(元)')
ax2.set_ylabel(u'(百万元)')

                      每股净资产    每股收益  每股现金含量  每股资本公积金  固定资产合计   流动资产合计  
南宁糖业(000911)项目：财务摘要                                                     
2015-09-30           4.9220  0.1225 -1.8303   4.4944     NaN  3147.74   
2015-12-31           4.9146  0.1847  0.5290   4.4944     NaN  3032.40   
2016-03-31           4.5619 -0.3527 -3.1519   4.4944     NaN  3868.23   
2016-06-30           4.2956 -0.6190 -3.9426   4.4944     NaN  3522.59   
2016-09-30           4.1173 -0.7973 -2.4654   4.4944     NaN  3194.63   

                        资产总计    长期负债合计    主营业务收入      财务费用       净利润  
南宁糖业(000911)项目：财务摘要                                                   
2015-09-30           4730.00   695.012  2208.240  104.7310   39.7108  
2015-12-31           5669.74   732.949  3138.420  143.6370   59.8534  
2016-03-31           6565.34  1105.460   515.594   39.3776 -114.2920  
2016-06-30           6086.67  1033.750   920.286   90.8180 -200.6000  
2016-09-30           6021.67  1055.570  1624.810  135.9120 -258.3930  
Index([u'每股净资产', u'每股收益', u'每股现金含量', u'每股资本公积金', u'固定资产合计', u'流动资产合计', u'资产总计',
       u'长期负债合计', u'主营业务收入', u'财务费用', u'净利润'],
      dtype='object')
南宁糖业(000911)项目：财务摘要
Out[403]: <matplotlib.text.Text at 0xdde1670>

matplotlib绘图:

代码:


def _fin_abs_cell_to_float(cell, divisor=1.0):
    """Convert one scraped table cell to a float, dividing by *divisor*.

    Parameters
    ----------
    cell : str, number or None
        Raw cell text such as u'1,234.56元'. ``None`` (a tag without text)
        and blank strings yield NaN; numeric input (for example the NaN
        pandas inserts for a missing key) is passed through the division.
    divisor : float
        Value the parsed number is divided by.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the cell holds non-blank text that ``float()`` cannot parse.
    """
    if cell is None:
        return np.nan
    if isinstance(cell, (int, float)):
        return float(cell) / divisor
    # Drop the currency suffix and thousands separators before parsing.
    text = cell.strip().replace(u'元', u'').replace(u',', u'')
    if not text:
        return np.nan
    return float(text) / divisor


def get_gg_fin_abs(code='000911'):
    """Scrape the Sina Finance financial-summary table of one stock.

    The page's ``FundHoldSharesTable`` is walked row by row. A row whose
    first cell contains ``<strong>`` starts a new report period (the date is
    read from the second cell); the following rows are label/value pairs.
    A period is stored once its u'净利润' row has been read, so a period
    without that row is dropped.

    Parameters
    ----------
    code : str
        Stock code substituted into the page URL.

    Returns
    -------
    pandas.DataFrame
        One row per report date (``datetime`` index, sorted ascending,
        index name taken from the table's first ``<th>``), one column per
        label. The first four columns are parsed as-is; every later column
        is divided by 1,000,000. Blank cells become NaN.

    Raises
    ------
    ValueError
        If a report date or a non-blank cell cannot be parsed.

    Notes
    -----
    - In an XPath such as ``'td[1]'`` the ``[n]`` predicate is 1-based.
    - ``.text`` on an element is terser than ``.xpath('text()')[0]`` when
      the element exists; ``.find()`` returns one element or ``None`` and
      ``.findall()`` returns a list.
    - ``<tbody>`` is usually inserted by the browser and is absent from the
      raw HTML, so paths go through ``<table>`` instead.

    References
    ----------
    - http://stackoverflow.com/questions/37080910/get-all-td-content-inside-tbody-of-tr-in-python-using-lxml
    """
    from collections import OrderedDict

    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_FinanceSummary/stockid/%s.html'
    url = url % (code)
    craw = crawler.Crawler(url)
    craw.idom()

    trs = craw.dom.xpath('//table[@id="FundHoldSharesTable"]//tr')
    # Row count is kept as diagnostic output, as in the original.
    print(len(trs))

    # The first row carries the table title in its <th>.
    header = trs[0].find('th').text.strip()

    reports = OrderedDict()   # report date string -> {label: raw text}
    report_date = None
    quarter = None            # dict being filled for the current period
    for tr in trs[1:]:
        if tr.xpath('td[@height="5px"]'):
            # Spacer row between periods.
            continue
        label_td = tr.find('td[1]')
        if label_td is None:
            # Row without any <td>; nothing to read.
            continue
        if label_td.find('strong') is not None:
            # Date row: both cells are wrapped in <strong>.
            report_date = tr.find('td[2]/strong').text
            quarter = OrderedDict()
            continue
        if quarter is None:
            # Data row before the first date row; no period to attach it to.
            continue
        label = label_td.text
        # Test the element itself, not its .text: the value may sit inside
        # an <a> link or directly in the <td>.
        link = tr.find('td[2]/a')
        value = link.text if link is not None else tr.find('td[2]').text
        quarter[label] = value
        if label == u'净利润':
            # Last row of a period: commit the collected values.
            reports[report_date] = quarter

    # list(...) keeps this working where dict views are not lists.
    df = pd.DataFrame(list(reports.values()), index=list(reports.keys()))
    df.index = [datetime.strptime(s.strip()[:10], '%Y-%m-%d')
                for s in df.index]
    df.index.name = header

    # Assign whole columns (not positional slices) so each column takes a
    # float dtype instead of staying object.
    for pos, col in enumerate(list(df.columns)):
        divisor = 1.0 if pos < 4 else 1000000.0
        df[col] = [_fin_abs_cell_to_float(v, divisor) for v in df[col]]

    return df.sort_index()

#==============================================================================
# print df.tail()
# print df.columns
# print df.index.name
# 
# fig,(ax1,ax2)=plt.subplots(2,1)
# # fig.set_figheight(fig.get_figheight()*2)
# df.ix[:,(0,3)].plot(ax=ax1)
# df.ix[:,5:7].plot(ax=ax2)
# ax1.set_ylabel(u'(元)')
# ax2.set_ylabel(u'(百万元)')
#==============================================================================

相关阅读:
jQuery操作Table学习总结[转]
SQL语句中的单引号处理以及模糊查询
 正则表达式实现将html文本转换为纯文本格式(将html字符串转换为纯文本方法)
ASP.NET中使用UpdatePanel实现局部异步刷新方法和攻略(转)
Response.Redirect在新窗口打开（转载）
position属性absolute与relative 的区别
 下载文件
 gridveiw的使用
 MarkDown和流程图诠释你的代码
 git使用笔记
原文地址：https://www.cnblogs.com/duan-qs/p/6740525.html