• python 爬取网页天天基金


    # encoding=utf-8
    import pandas as pd
    import requests
    from lxml import etree
    import re
    import collections
    
    
    def fund_code_name(top_n=50):
        """Crawl the EastMoney fund-ranking endpoint and return the codes of the
        funds ranked highest by 1-week growth rate.

        The endpoint returns a JS assignment ("var rankData = {datas:[...]}");
        the payload is a flat, comma-separated field list with 25 fields per
        fund record. Records are re-chunked, loaded into a DataFrame, sorted by
        the 1-week growth column, and the top slice is also written to a CSV
        file as a side effect.

        :param top_n: how many top funds to keep (default 50, backward
            compatible with the original behavior).
        :return: list of fund-code strings.
        """
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Referer': 'http://fund.eastmoney.com/data/fundranking.html',
            'Cookie': 'st_si=51694067779834; st_asi=delete; ASP.NET_SessionId=e1pno0koqkcp5es3xyzyrg1n; EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND8=null; EMFUND0=null; _adsame_fullscreen_18503=1; EMFUND9=08-16 01:16:38@#$%u4E07%u5BB6%u65B0%u5229%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408@%23%24519191; st_pvi=87492384111747; st_sp=2020-08-16%2000%3A05%3A17; st_inirUrl=http%3A%2F%2Ffund.eastmoney.com%2Fdata%2Ffundranking.html; st_sn=15; st_psi=20200816011636912-0-9218336114'

        }
        response = requests.get(
            url='http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft=all&rs=&gs=0&sc=zzf&st=desc&sd=2018-11-26&ed=2019-11-26&qdii'
                '=&tabSubtype=,,,,,&pi=1&pn=6450&dx=1&v=0.6516597604405057', headers=header)
        text = response.text
        # Response looks like "var rankData = {datas:[...],allRecords:...}";
        # split only on the first '=' so '=' inside the payload cannot break parsing.
        data = text.split('=', 1)[1]
        # Raw string so '\[' is a real regex escape, not an invalid str escape.
        compile_data = re.findall(r"{datas:\[(.*)\],allRecords", data)[0]
        quota_arrays = compile_data.strip('[]').replace('"', "").split(",")
        # Each fund record is 25 comma-separated fields. Derive the record
        # count from the payload instead of hard-coding 258 intervals, so a
        # different number of funds is handled correctly.
        fields_per_record = 25
        n_records = len(quota_arrays) // fields_per_record
        narrays = [quota_arrays[i * fields_per_record:(i + 1) * fields_per_record]
                   for i in range(n_records)]
        header = ["基金代码", "基金简称", "基金条码", "日期",
                  "单位净值", "累计净值", "日增长率", "近1周增长率", "近1月增长率", "近3月", "近半年", "近1年", "近2年", "近3年",
                  "今年来", "成立来", "其他1", "其他2", "其他3", "其他4", "其他5", "其他6", "其他7", "其他8", "其他9"]
        df = pd.DataFrame(narrays, columns=header)
        df_part = df[["基金代码", "基金简称", "日期",
                      "单位净值", "累计净值", "日增长率", "近1周增长率", "近1月增长率", "近3月", "近半年"]]

        df_tmp = df_part.sort_values(by=["近1周增长率"], ascending=False, axis=0)
        rank_fund_code = df_tmp.head(top_n)["基金代码"]
        fund_codes_list = rank_fund_code.values.tolist()
        print("前50强基金:", fund_codes_list)
        # utf_8_sig adds a BOM so Excel opens the Chinese headers correctly.
        df_tmp.head(top_n).to_csv("./本季度前50强基金收益.csv", encoding="utf_8_sig")
        return fund_codes_list
    
    
    def get_one_fund_stocks(fund_code):
        """Fetch the top-10 holdings (stock codes) of one fund's latest quarter.

        The endpoint returns a JS snippet whose 'content:"..."' field embeds
        the holdings table HTML; that fragment is parsed with lxml and the
        numeric <a> texts (stock codes) are collected.

        :param fund_code: 6-digit fund code string.
        :return: list of stock-code strings; may be empty when the page has
            no holdings data.
        """
        url = "http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type=jjcc&code={}&topline=10&year=&month=&rt=0.5032668912422176".format(
            fund_code)
        head = {
            "Cookie": "EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND8=null; EMFUND0=null; st_si=44023331838789; st_asi=delete; EMFUND9=08-16 22:04:25@#$%u4E07%u5BB6%u65B0%u5229%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408@%23%24519191; ASP.NET_SessionId=45qdofapdlm1hlgxapxuxhe1; st_pvi=87492384111747; st_sp=2020-08-16%2000%3A05%3A17; st_inirUrl=http%3A%2F%2Ffund.eastmoney.com%2Fdata%2Ffundranking.html; st_sn=12; st_psi=2020081622103685-0-6169905557"
            ,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}

        response = requests.get(url, headers=head)
        # Raw string for the regex; the holdings HTML lives between
        # content:" and ",arryear in the returned JS.
        div = re.findall(r'content:"(.*)",arryear', response.text)[0]
        # Wrap the fragment in a minimal document so etree.HTML parses it.
        html_body = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>test</title></head><body>%s</body></html>' % (
            div)
        html = etree.HTML(html_body)
        stock_info = html.xpath('//div[1]/div/table/tbody/tr/td/a')
        # The <a> nodes mix stock codes and stock names; keep only the
        # all-digit texts (the codes). (The former unused 'stock_money'
        # xpath was removed.)
        stock_one_fund = [a.text for a in stock_info if a.text and a.text.isdigit()]
        if len(stock_one_fund) > 1:
            print("基金代码:{}".format(fund_code), "基金持有前10股票池", stock_one_fund)
        return stock_one_fund  # can return an empty list
    
    
    def static_best_stock(rank=20):
        """ 统计收益最佳前50机构共同持有股票代码情况,修改rank数量可调整展示股票排名数目"""
        # NOTE: the docstring above is printed at runtime via __doc__ (see
        # below), so its original (Chinese) text is deliberately kept verbatim.
        # Count, across the top-50 funds, how often each stock appears in the
        # holdings, then display the `rank` most commonly held stock codes.
        rank_codes = fund_code_name()
        stocks_array = []
        for index, code in enumerate(rank_codes):
            if index == 0:
                # Print the banner once, before the first fund is processed.
                print("<" * 30 + "FBI WARNING近1周收益最高基金的排名高到低排序以及股票池情况" + ">" * 30)
            stocks = get_one_fund_stocks(code)
            # len(stocks) > 1 already implies a non-empty list, so the former
            # redundant "and stocks" check was dropped.
            if len(stocks) > 1:
                stocks_array.extend(stocks)
        count_each_stock = collections.Counter(stocks_array)
        print("<" * 30 + "FBI WARNING,{}".format(static_best_stock.__doc__) + ">" * 30)
        print("#" * 30 + "本季度基金机构共同持有股票数目排行前{}股票代码情况".format(rank) + "#" * 30)
        df = pd.DataFrame.from_dict(count_each_stock, orient='index', columns=["持有该股机构数目"])
        df = df.reset_index().rename(columns={"index": "股票代码"})
        df = df.sort_values(by="持有该股机构数目", ascending=False)
        print(df.head(rank))
    
    
    if __name__ == '__main__':
        # Script entry point: crawl the top-50 funds by 1-week growth and
        # print the stocks most commonly held across them.
        static_best_stock()

     备注:本文只为个人练习学习,如果用于违法行为概不负责

  • 相关阅读:
    [USACO15JAN]草鉴定Grass Cownoisseur (分层图,最长路,$Tarjan$)
    P1558 色板游戏 (线段树)
    [Vani有约会]雨天的尾巴 (线段树合并)
    UVA11806 Cheerleaders (容斥)
    [JSOI2007]建筑抢修 (贪心)
    [POI2015]KIN (线段树)
    [六省联考2017]组合数问题 (矩阵优化$dp$)
    [BZOJ2957] 楼房重建 (线段树,递归)
    [USACO Section 5.3]量取牛奶 Milk Measuring (动态规划,背包$dp$)
    P2647 最大收益 (动态规划)
  • 原文地址:https://www.cnblogs.com/SunshineKimi/p/13517365.html
Copyright © 2020-2023  润新知