最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:
球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30
在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:
/// <summary>
/// Crawls the Sohu NBA team pages (teamid 1 through 30). For every team it logs and
/// stores one "TEAM" record, then stores one "player" record per roster row. Both
/// record kinds carry the same teamid so they can be joined afterwards.
/// </summary>
public void Run()
{
    Logger.ClearAll();

    // Every team attribute is an <li> of this list; the :eq(n) position picks the attribute.
    const string infoItem = "div.blockB>div.left>div.pt>ul>li";

    for (int i = 1; i <= 30; i++)
    {
        Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + i);
        Default.Ready();

        // Read each value once; the page label prefix is stripped so only the value remains.
        var teamid = i.ToString();
        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
        var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
        var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主体育馆:", "");
        var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
        var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
        var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
        var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("现任主教练:", "");

        Logger.Log(teamname);
        Logger.Log(teamurl);
        Logger.Log(teamcity);
        Logger.Log(gym);
        Logger.Log(peoplenum);
        Logger.Log(intonba);
        Logger.Log(champion);
        Logger.Log(coach);

        // Fix: the home city was scraped and logged but never stored, while "teamurl"
        // was run through the city-label replacement. Store both fields as scraped.
        DataManager.AppendData("TEAM", DataEntry.Create()
            .Set("teamid", teamid)
            .Set("teamname", teamname)
            .Set("teamurl", teamurl)
            .Set("teamcity", teamcity)
            .Set("gym", gym)
            .Set("peoplenum", peoplenum)
            .Set("intonba", intonba)
            .Set("champion", champion)
            .Set("coach", coach)
        );

        Logger.Log(teamid);

        foreach (var player in Default.SelectNodes("div.tab>table tr"))
        {
            var link = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");

            // Rows without a player link carry no player data, so skip them.
            // NOTE(review): presumably these are the table header rows - confirm against the page.
            if (link.IsEmpty())
            {
                continue;
            }

            // Fix: the pattern must be \d+ (digits). The bare "d+" only matches literal 'd' characters.
            var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;

            var num = player.SelectSingleNode("TD:eq(0)");
            var playerimageurl = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A");
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)");
            var height = player.SelectSingleNode("TD:eq(3)");
            var weight = player.SelectSingleNode("TD:eq(4)");
            var birth = player.SelectSingleNode("TD:eq(5)");
            var college = player.SelectSingleNode("TD:eq(6)");

            Logger.Log(playerimageurl.Text());
            Logger.Log(playername.Text());
            Logger.Log(position.Text());
            Logger.Log(height.Text());
            Logger.Log(weight.Text());
            Logger.Log(birth.Text());
            Logger.Log(college.Text());
            Logger.Log(playerimageurl.Attr("src"));
            Logger.Log(playerid);

            DataManager.AppendData("player", DataEntry.Create()
                .Set("playerid", playerid)
                .Set("teamid", teamid)
                .Set("playername", playername.Text())
                .Set("position", position.Text())
                .Set("height", height.Text())
                .Set("weight", weight.Text())
                .Set("birth", birth.Text())
                .Set("college", college.Text())
                .Set("num", num.Text())
                .Set("playerimageurl", playerimageurl.Attr("src"))
            );
        }
    }
}
比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012
脚本如下:
/// <summary>
/// Crawls one month of the Sohu NBA schedule. It collects the link of every game that
/// has a "技术统计" (box score) page, visits each one, and stores two "GAMESTATIC"
/// records per game: status "0" for the away team and status "1" for the home team.
/// Each record holds the four quarter scores, the final score and up to four
/// overtime-period scores (empty strings when the period was not played).
/// </summary>
public void Run()
{
    Logger.ClearAll();

    // One URL serves both as the page to open and as the base for resolving relative links.
    const string scheduleUrl = "http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012";
    const string teamIdPattern = @"(?<id>\d+)\.jpg"; // Fix: \d (digits) and a literal dot.

    Default.Navigate(scheduleUrl);
    Default.Ready();

    // Gather every link up front, before the browser navigates away from the schedule page.
    var baseUri = new Uri(scheduleUrl);
    var urls = new List<string>();
    // Fix: the quotes inside the selector must be escaped or the string literal ends early.
    foreach (var g in Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")"))
    {
        urls.Add(new Uri(baseUri, g.Attr("href")).ToString());
    }

    foreach (var url in urls)
    {
        if (Default.Available == false)
        {
            return;
        }

        Default.Navigate(url);
        Default.Ready();

        var scores = Default.SelectNodes("table.tab04 tr");
        var scoreslist = Default.SelectNodes("table.tab02 tr>td");
        var logos = Default.SelectNodes("td.logo img");

        // The indexing below needs two logos, three score rows and eight quarter cells.
        // Skip (and report) a page that lacks them instead of aborting the whole run.
        if (logos.Count < 2 || scores.Count < 3 || scoreslist.Count < 8)
        {
            Logger.Log("Skipped, unexpected page layout: " + url);
            continue;
        }

        // The team id is the number in the logo image file name; the away team's logo comes first.
        var awayid = Regex.Match(logos[0].Attr("src"), teamIdPattern).Groups["id"].Value;
        var homeid = Regex.Match(logos[1].Attr("src"), teamIdPattern).Groups["id"].Value;

        var gametime = Default.SelectSingleNode("div.center>h2").Text().Replace("开始比赛", "");

        // Overtime scores stay empty unless the page announces overtime ("加时赛").
        var awayOvertime = new string[] { "", "", "", "" };
        var homeOvertime = new string[] { "", "", "", "" };

        var overtimeMarker = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
        if (!overtimeMarker.IsEmpty())
        {
            // The number of <th> cells after div.more is the number of overtime periods played.
            int periods = Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count;

            // Only 1 to 4 periods are recorded; any other count leaves the fields empty.
            if (periods >= 1 && periods <= 4)
            {
                for (int k = 0; k < periods; k++)
                {
                    awayOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(" + k + ")").Text();
                    homeOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(" + k + ")").Text();
                }
            }
        }

        // Away team: quarter cells 0-3, final score in row 1 of the totals table.
        DataManager.AppendData("GAMESTATIC", DataEntry.Create()
            .Set("teamid", awayid)
            .Set("gametime", gametime)
            .Set("score1", scoreslist[0].Text())
            .Set("score2", scoreslist[1].Text())
            .Set("score3", scoreslist[2].Text())
            .Set("score4", scoreslist[3].Text())
            .Set("score", scores[1].Text())
            .Set("gameid", url)
            .Set("status", "0")
            .Set("jiashiscore1", awayOvertime[0])
            .Set("jiashiscore2", awayOvertime[1])
            .Set("jiashiscore3", awayOvertime[2])
            .Set("jiashiscore4", awayOvertime[3])
        );

        // Home team: quarter cells 4-7, final score in row 2 of the totals table.
        DataManager.AppendData("GAMESTATIC", DataEntry.Create()
            .Set("teamid", homeid)
            .Set("gametime", gametime)
            .Set("score1", scoreslist[4].Text())
            .Set("score2", scoreslist[5].Text())
            .Set("score3", scoreslist[6].Text())
            .Set("score4", scoreslist[7].Text())
            .Set("score", scores[2].Text())
            .Set("gameid", url)
            .Set("status", "1")
            .Set("jiashiscore1", homeOvertime[0])
            .Set("jiashiscore2", homeOvertime[1])
            .Set("jiashiscore3", homeOvertime[2])
            .Set("jiashiscore4", homeOvertime[3])
        );
    }
}
这里的亮点在于对加时赛的处理 (原脚本的第48, 49两行, 即判断"加时赛"的那一段). 不是所有的比赛都有加时赛, 即使有, 也可能打多个加时 (1-4个). 因此甜瓜非常细心地对这部分也做了处理. 个人感觉这段代码还有优化的余地, 但这种写法简单直白, 一目了然, 也是很不错的.
最后运行起来:
文中开发工具 Spider Studio (采集工作站) 下载地址: http://www.gdtsearch.com/products.spiderstudio.htm 。安装后运行, 将脚本复制进去, 点击"运行"即可看到效果.
Spider Studio QQ群: 45995410