• 示例


    最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:

    球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

    在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:

    /// <summary>
    /// Visits the 30 team pages (teamid = 1..30) on data.sports.sohu.com, stores one
    /// "TEAM" record per team and one "player" record per roster row, and links the
    /// two through the shared "teamid" field.
    /// </summary>
    public void Run()
    {
        Logger.ClearAll();

        // Every entry of the team info box lives under this list; only the index differs.
        var infoItem = "div.blockB>div.left>div.pt>ul>li";

        for (int i = 1; i <= 30; i++)
        {
            Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + i);
            Default.Ready();

            // Read each value once so that the logged text and the stored text cannot diverge.
            var teamid = i.ToString();
            var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
            var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
            var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
            var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主体育馆:", "");
            var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
            var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
            var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
            var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("现任主教练:", "");

            Logger.Log(teamname);
            Logger.Log(teamurl);
            Logger.Log(teamcity);
            Logger.Log(gym);
            Logger.Log(peoplenum);
            Logger.Log(intonba);
            Logger.Log(champion);
            Logger.Log(coach);

            // The home city was scraped and logged before but never written to the
            // record; it is now stored under "teamcity".
            DataManager.AppendData("TEAM",
                                   DataEntry.Create()
                                   .Set("teamid", teamid)
                                   .Set("teamname", teamname)
                                   .Set("teamurl", teamurl)
                                   .Set("teamcity", teamcity)
                                   .Set("gym", gym)
                                   .Set("peoplenum", peoplenum)
                                   .Set("intonba", intonba)
                                   .Set("champion", champion)
                                   .Set("coach", coach)
                                  );
            Logger.Log(teamid);

            var playerRows = Default.SelectNodes("div.tab>table tr");
            foreach (var player in playerRows)
            {
                var link = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");

                // The selector returns every row of the table, so rows without a player
                // link (presumably the header row) are skipped.
                // NOTE(review): assumes IsEmpty() reports "nothing matched" for row-level
                // nodes the same way it does for page-level nodes - confirm.
                if (link.IsEmpty())
                {
                    continue;
                }

                // The player id is the first run of digits in the profile link.
                var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;

                var num = player.SelectSingleNode("TD:eq(0)").Text();
                var image = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
                var playerimageurl = image.Attr("src");
                var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A").Text();
                var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)").Text();
                var height = player.SelectSingleNode("TD:eq(3)").Text();
                var weight = player.SelectSingleNode("TD:eq(4)").Text();
                var birth = player.SelectSingleNode("TD:eq(5)").Text();
                var college = player.SelectSingleNode("TD:eq(6)").Text();

                Logger.Log(image.Text());
                Logger.Log(playername);
                Logger.Log(position);
                Logger.Log(height);
                Logger.Log(weight);
                Logger.Log(birth);
                Logger.Log(college);
                Logger.Log(playerimageurl);
                Logger.Log(playerid);

                DataManager.AppendData("player",
                                       DataEntry.Create()
                                       .Set("playerid", playerid)
                                       .Set("teamid", teamid)
                                       .Set("playername", playername)
                                       .Set("position", position)
                                       .Set("height", height)
                                       .Set("weight", weight)
                                       .Set("birth", birth)
                                       .Set("college", college)
                                       .Set("num", num)
                                       .Set("playerimageurl", playerimageurl)
                                      );
            }
        }
    }

    比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

    脚本如下:

    /// <summary>
    /// Opens the monthly schedule page, collects the link of every game that has a
    /// "技术统计" (box score) entry, then visits each box score and stores two
    /// "GAMESTATIC" records per game: one for the away team (status "0") and one for
    /// the home team (status "1"). Each record holds the four quarter scores, the
    /// total score and up to four overtime period scores.
    /// </summary>
    public void Run()
    {
        Logger.ClearAll();

        var scheduleUrl = "http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012";
        Default.Navigate(scheduleUrl);
        Default.Ready();

        // The inner quotes must be escaped, otherwise the string literal ends early
        // and the script does not compile.
        var games = Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")");

        // Collect all links first because navigating away invalidates the schedule page.
        // Links are resolved against the page that was actually opened.
        var scheduleUri = new Uri(scheduleUrl);
        List<string> urls = new List<string>();
        foreach (var g in games)
        {
            urls.Add(new Uri(scheduleUri, g.Attr("href")).ToString());
        }

        foreach (var url in urls)
        {
            if (Default.Available == false)
            {
                return;
            }

            Default.Navigate(url);
            Default.Ready();

            var scores = Default.SelectNodes("table.tab04 tr");
            var scoreslist = Default.SelectNodes("table.tab02 tr>td");
            var logos = Default.SelectNodes("td.logo img");

            // Skip pages that do not have the expected box score layout instead of
            // failing on an out-of-range index below.
            if (logos.Count < 2 || scores.Count < 3 || scoreslist.Count < 8)
            {
                Logger.Log("Skipped (unexpected page layout): " + url);
                continue;
            }

            // The team id is the numeric file name of the logo image; the first logo
            // is stored as the away team and the second as the home team.
            var awayid = Regex.Match(logos[0].Attr("src"), @"(?<id>\d+)\.jpg").Groups["id"].Value;
            var homeid = Regex.Match(logos[1].Attr("src"), @"(?<id>\d+)\.jpg").Groups["id"].Value;

            // Cells 0-3 are stored as the away quarters, cells 4-7 as the home quarters.
            var awayscore1 = scoreslist[0].Text();
            var awayscore2 = scoreslist[1].Text();
            var awayscore3 = scoreslist[2].Text();
            var awayscore4 = scoreslist[3].Text();
            var homescore1 = scoreslist[4].Text();
            var homescore2 = scoreslist[5].Text();
            var homescore3 = scoreslist[6].Text();
            var homescore4 = scoreslist[7].Text();

            var gametime = Default.SelectSingleNode("div.center>h2").Text().Replace("开始比赛", "");

            // Overtime scores stay empty when the game had no overtime.
            var awayOvertime = new string[] { "", "", "", "" };
            var homeOvertime = new string[] { "", "", "", "" };

            var td = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
            if (!td.IsEmpty())
            {
                // One <th> per overtime period. Only four columns exist in the record,
                // so longer games keep their first four periods.
                var periods = Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count;
                if (periods > 4)
                {
                    periods = 4;
                }

                // Row 1 of the overtime table is read as the away team, row 2 as the home team.
                for (int k = 0; k < periods; k++)
                {
                    awayOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(" + k + ")").Text();
                    homeOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(" + k + ")").Text();
                }
            }

            DataManager.AppendData("GAMESTATIC",
                                   DataEntry.Create()
                                   .Set("teamid", awayid)
                                   .Set("gametime", gametime)
                                   .Set("score1", awayscore1)
                                   .Set("score2", awayscore2)
                                   .Set("score3", awayscore3)
                                   .Set("score4", awayscore4)
                                   .Set("score", scores[1].Text())
                                   .Set("gameid", url)
                                   .Set("status", "0")
                                   .Set("jiashiscore1", awayOvertime[0])
                                   .Set("jiashiscore2", awayOvertime[1])
                                   .Set("jiashiscore3", awayOvertime[2])
                                   .Set("jiashiscore4", awayOvertime[3])
                                  );
            DataManager.AppendData("GAMESTATIC",
                                   DataEntry.Create()
                                   .Set("teamid", homeid)
                                   .Set("gametime", gametime)
                                   .Set("score1", homescore1)
                                   .Set("score2", homescore2)
                                   .Set("score3", homescore3)
                                   .Set("score4", homescore4)
                                   .Set("score", scores[2].Text())
                                   .Set("gameid", url)
                                   .Set("status", "1")
                                   .Set("jiashiscore1", homeOvertime[0])
                                   .Set("jiashiscore2", homeOvertime[1])
                                   .Set("jiashiscore3", homeOvertime[2])
                                   .Set("jiashiscore4", homeOvertime[3])
                                  );
        }
    }

    这里的亮点是脚本中处理"加时赛"的那一段(原文代码清单的第48, 49行): 不是所有的比赛都有加时赛, 而且即使有, 也可能不止一节(这里处理了1到4节的情况). 甜瓜非常细心地对这种情况也做了处理. 个人感觉这块代码还有优化的余地, 但是这种处理方式简单直白, 一目了然, 也是很不错的.

    最后运行起来:

     

     

    文中开发工具Spider Studio (采集工作站)下载地址: http://www.gdtsearch.com/products.spiderstudio.htm. 安装后运行, 将脚本复制进去点"运行"即可看到效果. 

    Spider Studio QQ群: 45995410

  • 相关阅读:
    leetcode python翻转字符串里的单词
    leetcode python快乐数
    Usb gadget驱动
    cnblogs的第一篇
    python返回函数+匿名函数+装饰器+偏函数
    1144. 递减元素使数组呈锯齿状
    208. Implement Trie (Prefix Tree)
    3. Longest Substring Without Repeating Characters
    5. Longest Palindromic Substring :manacher
    929. 独特的电子邮件地址
  • 原文地址:https://www.cnblogs.com/iamzyf/p/3446852.html
Copyright © 2020-2023  润新知