爬取目标网址 : http://music.163.com/#/song?id=409649818
需要爬取信息 : 网易云top13热评
使用之前的 HttpURLConnection 获取网页源码,经过分析发现,在源码中并没有热评信息(热评是页面加载完成后由浏览器通过 Ajax 请求另行获取的;另外 URL 中 # 之后的部分不会发送给服务器,所以实际拿到的只是站点的框架页面)
package bok;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Fetches the HTML returned for a NetEase Cloud Music song URL and prints it to
 * standard output as a single line (original line breaks are replaced by spaces).
 *
 * <p>Nothing is printed when the server answers with a status other than 200.
 */
public class GC {

    /**
     * Downloads the page and prints its source.
     *
     * @param args unused
     * @throws Exception if the connection cannot be opened or the body cannot be read
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): everything after '#' is a URL fragment and is not part of the
        // HTTP request, so the server is presumably asked for "/" rather than the song
        // page itself - confirm whether "http://music.163.com/song?id=..." was intended.
        URL url = new URL("http://music.163.com/#/song?id=409649818");
        HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
        try {
            if (httpURLConnection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                // StringBuilder avoids re-copying the whole page on every appended line.
                StringBuilder page = new StringBuilder();
                // try-with-resources closes the reader (and the underlying stream)
                // even when readLine() throws.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(httpURLConnection.getInputStream(), StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        // readLine() strips the terminator; a single space keeps lines apart.
                        page.append(line).append(' ');
                    }
                }
                System.out.println(page.toString());
            }
        } finally {
            // Release the connection regardless of the outcome.
            httpURLConnection.disconnect();
        }
    }
}
部分源码如下:
1 {/if} 2 {else} 3 <span class="u-icn u-icn-75"></span> 4 {/if} 5 </div> 6 </div> 7 </td> 8 <td class=""> 9 <div class="f-cb"> 10 <div class="tt"> 11 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}"> </span> 12 <div class="ttc"> 13 <span class="txt"> 14 {var alia=songAlia(x)} 15 <a href="/song?id=${x.id}"><b title="${x.name|escape}{if alia} - (${alia|escape}){/if}">${soil(x.name)}</b></a>{if alia}<span title="${alia|escape}" class="s-fc8"> - (${soil(alia)})</span>{/if} 16 {if x.mvid>0} 17 <span data-res-id="${x.id}" data-res-action="mv" title="播放mv" class="mv">MV</span> 18 {/if} 19 </span> 20 </div> 21 </div> 22 </div> 23 </td> 24 {/if} 25 <td class=" s-fc3"> 26 <span class="u-dur {if canDel}candel{/if}">${dur2time(x.duration/1000)}{if x.ftype==2}<i title="歌曲来自第三方网站" class="migu u-icn2 u-icn2-14"></i>{/if}</span> 27 <div class="opt hshow"> 28 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true" 29 data-res-type="18" 30 data-res-id="${x.id}" 31 data-res-action="addto" 32 {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a> 33 <span data-res-id="${x.id}" data-res-type="18" data-res-action="fav" class="icn icn-fav" title="收藏"></span> 34 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" {if x.album}data-res-pic="${x.album.picUrl}"{/if} class="icn icn-share" title="分享">分享</span> 35 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载"></span> 36 {if canDel} 37 <span data-res-id="${x.id}" data-res-type="18" data-res-action="delete" class="icn icn-del" title="删除">删除</span> 38 {/if} 39 </div> 40 </td> 41 <td class=""> 42 <div class="text" title="{list x.artists as art}${art.name}{if 
art_index<x.artists.length-1}/{/if}{/list}"> 43 ${getArtistName(x.artists, '', '', false, false, true)} 44 </div> 45 </td> 46 </tr> 47 {/list} 48 </tbody> 49 </table> 50 </textarea> 51 <textarea name="jst" id="m-wgt-song-pgm-list" style="display:none;"><table class="m-table m-table-prog"> 52 <tbody id="song-list"> 53 {list beg..end as y} 54 {var x=xlist[y]} 55 <tr id="${x.id|seed}" class="{if y%2!=0}even{/if} {if disable(x)}js-dis{/if}"> 56 <td class="first col1"> 57 <div class="hd"> 58 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}"> </span> 59 <span class="num">${y+1}</span> 60 </div> 61 </td> 62 <td class="col2"> 63 <div class="f-cb"> 64 <div class="tt"> 65 <div class="ttc"> 66 <span class="txt"> 67 {var alia=songAlia(x)} 68 <a href="/song?id=${x.id}"><b title="${x.name|escape}{if alia} - (${alia|escape}){/if}">${soil(x.name)}</b></a>{if alia}<span title="${alia|escape}" class="s-fc8"> - (${soil(alia)})</span>{/if} 69 {if x.mvid>0} 70 <span data-res-id="${x.id}" data-res-action="mv" title="播放mv" class="mv">MV</span> 71 {/if} 72 </span> 73 </div> 74 </div> 75 </div> 76 </td> 77 <td class="col3 s-fc3"> 78 <span class="u-dur {if canDel}candel{/if}">${dur2time(x.duration/1000)}{if x.ftype==2}<i title="歌曲来自第三方网站" class="migu u-icn2 u-icn2-14"></i>{/if}</span> 79 <div class="opt hshow"> 80 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true" 81 data-res-type="18" 82 data-res-id="${x.id}" 83 data-res-action="addto" 84 {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a> 85 <span data-res-id="${x.id}" data-res-type="18" data-res-action="fav" class="icn icn-fav" title="收藏"></span> 86 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" 
{if x.album}data-res-pic="${x.album.picUrl}"{/if} class="icn icn-share" title="分享">分享</span> 87 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载"></span> 88 {if canDel} 89 <span data-res-id="${x.id}" data-res-type="18" data-res-action="delete" class="icn icn-del" title="删除">删除</span> 90 {/if} 91 </div> 92 </td> 93 <td class="col4"> 94 <div class="text" title="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}"> 95 ${getArtistName(x.artists, '', '', false, false, true)} 96 </div> 97 </td> 98 <td class="col5"> 99 <div class="text"> 100 {if x.album} 101 <a href="/album?id=${x.album.id}" title="${x.album.name|escape}">${soil(x.album.name)}</a> 102 {/if} 103 </div> 104 </td> 105 </tr> 106 {/list} 107 </tbody> 108 </table> 109 </textarea> 110 <textarea name="jst" id="m-wgt-song-listen" style="display:none;"> <ul> 111 {list beg..end as y} 112 {var x=xlist[y]} 113 {if extData&&extData.limit&&y>=extData.limit} 114 {break} 115 {/if} 116 {var from=getFrom()} 117 <li id="${x.id|seed}" {if y%2 !=0 }class='even'{/if}> 118 <div class="hd "> 119 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}"> </span> 120 <span class="num">${y+1}.</span> 121 </div> 122 <div class="song"> 123 <div class="tt"> 124 <div class="ttc"> 125 <span class="txt"><a href="/song?id=${x.id}"><b title="${x.name}">${x.name}</b></a> 126 <span class='ar s-fc8'> <em>-</em> 127 ${getArtistName(x.artists, 's-fc8')} 128 </span> 129 </span> 130 </div> 131 </div> 132 <div class="opt"> 133 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true" data-res-type="18" data-res-id="${x.id}" data-res-action="addto" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a> 134 <span data-res-id="${x.id}" data-res-type="18" data-res-action="subscribe" class="icn 
icn-fav" title="收藏"></span> 135 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" class="icn icn-share" title="分享">分享</span> 136 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载">下载</span> 137 </div> 138 </div> 139 <div class="tops"> 140 <span class="bg" style='${x.score*100/x.max}%;'></span> 141 {if extData.showCount&&x.playCount}<span class="times f-ff2">${x.playCount}次</span>{/if} 142 </div> 143 </li> 144 {/list} 145 </ul> 146 {if extData&&extData.limit&&xlist.length>extData.limit} 147 <div class="more"> 148 <a href="/user/songs/rank?id=${hostId}" >查看更多></a> 149 </div> 150 {/if} 151 </textarea> 152 <textarea name="jst" id="m-wgt-purchased-song-list" style="display:none;"> {list beg..end as y} 153 {var x=xlist[y]} 154 <tr id="${x.id|seed}" class="{if y%2==1}even{/if} {if disable(x)}js-dis{/if}"> 155 <td class="left"> 156 <div class="hd {if type=='rank'}rank{/if}"> 157 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}"> </span> 158 <span class="num">${y+1}</span> 159 {if type=='rank'} 160 <div class="rk rk-1"> 161 {if x.lastRank>=0} 162 {if y-x.lastRank>0} 163 <span class="ico u-icn u-icn-74 s-fc10">${y-x.lastRank}</span> 164 {elseif y-x.lastRank==0} 165 <span class="ico u-icn u-icn-72 s-fc4">0</span> 166 {else} 167 <span class="ico u-icn u-icn-73 s-fc9">${x.lastRank-y}</span> 168 {/if} 169 {else} 170 <span class="u-icn u-icn-75"></span> 171 {/if} 172 </div> 173 {/if} 174 </div> 175 </td> 176 <td class="u-hasopt"> 177 <div class="f-cb"> 178 <div class="tt"> 179 <div class="ttc"> 180 <span class="txt"> 181 {var alia=songAlia(x)} 182 <a href="/song?id=${x.id}"><b title="${x.name|escape}{if alia} - 
(${alia|escape}){/if}">${soil(x.name)}</b></a>{if alia}<span title="${alia|escape}" class="s-fc8"> - (${soil(alia)})</span>{/if} 183 {if x.mvid>0} 184 <span data-res-id="${x.id}" data-res-action="mv" title="播放mv" class="mv">MV</span> 185 {/if} 186 </span> 187 </div> 188 </div> 189 <div class="opt hshow"> 190 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true" 191 data-res-type="18" 192 data-res-id="${x.id}" 193 data-res-action="addto" 194 {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a> 195 <span data-res-id="${x.id}" data-res-type="18" data-res-action="fav" class="icn icn-fav" title="收藏"></span> 196 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" {if x.album}data-res-pic="${x.album.picUrl}"{/if} class="icn icn-share" title="分享">分享</span> 197 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载"></span> 198 {if canDel} 199 <span data-res-id="${x.id}" data-res-type="18" data-res-action="delete" class="icn icn-del" title="删除">删除</span> 200 {/if} 201 </div> 202 </div> 203 </td> 204 <td class=""> 205 <div class="text" title="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}"> 206 ${getArtistName(x.artists, '', '', false, false, true)} 207 </div> 208 </td> 209 <td class=""> 210 <div class="text"> 211 {if x.album} 212 <a href="/album?id=${x.album.id}" title="${x.album.name|escape}">${soil(x.album.name)}</a> 213 {/if} 214 </div> 215 </td> 216 <td class="s-fc3">${formatTime(x.paidTime)}</td> 217 </tr> 218 {/list} 219 </textarea> 220 <textarea name="ntp" id="m-msg-private-send" style="display:none;"><div class="lyct lyct-1 f-cb"> 221 <div class="m-lyshare m-plshare"> 222 <div class="u-err j-flag" style="display: none;">最多选择10位好友</div> 223 <div class="item item-1 f-cb"> 224 <label>发 
给:</label> 225 <div class="ct f-pr j-flag"> 226 </div> 227 </div> 228 <div class="item f-cb"> 229 <label>内 容:</label> 230 <div class="ct j-flag"> 231 </div> 232 </div> 233 </div> 234 </div> 235 </textarea> 236 <textarea name="jst" id="m-wgt-redeem-tip" style="display:none;"><div class="lyct"> 237 <div class="result f-tc"> 238 <div class="text"> 239 <h4 class="f-fs2"><i class="icn u-icn2 u-icn2-{if type=='error'}16{else}15{/if}"></i>${title}</h4> 240 <p class="f-fs1">${sub}</p> 241 </div> 242 <div class="btnwrap {if ok&&cc}btnwrap-1{/if}"> 243 {if ok} 244 <a data-action="ok" href="javascript:;" class="u-btn2 u-btn2-2 {if ok.length <= 3}u-btn2-w2{/if}" hidefocus="true"><i>${ok}</i></a> 245 {/if} 246 {if cc} 247 <a data-action="cc" href="javascript:;" class="u-btn2 u-btn2-1 u-btn2-w2" hidefocus="true"><i>${cc}</i></a> 248 {/if} 249 </div> 250 </div> 251 </div> 252 </textarea> 253 </div> 254 <script src="//s3.music.126.net/sep/s/2/core.js?88f5bc0082242aff627ec140af2072b3" type="text/javascript"></script><script src="//s3.music.126.net/sep/s/2/pt_frame_index.js?56ef55a585c894efc0e5d564ccbe4f44" type="text/javascript"></script> 255 </body> 256 <script type="text/javascript"> 257 var _gaq = _gaq || []; 258 _gaq.push(['_setAccount', 'UA-38766552-1'],['_setLocalGifPath', '/UA-38766552-1/__utm.gif'],['_setLocalRemoteServerMode']); 259 _gaq.push(['_trackPageview']); 260 (function() { 261 var ga = document.createElement('script'); 262 ga.type = 'text/javascript'; 263 ga.async = true; 264 ga.src = '//wr.da.netease.com/ga.js'; 265 var s = document.getElementsByTagName('script')[0]; 266 s.parentNode.insertBefore(ga, s); 267 })();//fix ipad下的一个bug 268 if (navigator.userAgent.indexOf('iPad') != -1) { 269 iframeHeight = Math.max( 270 Math.max(document.body.scrollHeight, document.documentElement.scrollHeight), 271 Math.max(document.body.offsetHeight, document.documentElement.offsetHeight), 272 Math.max(document.body.clientHeight, document.documentElement.clientHeight) 273 ); 274 
top.document.body.style.height = iframeHeight + 20 + 'px'; 275 }</script> 276 </html>
获取的源码中既然没有热评信息
只能通过 F12 -> Network 分析网络请求
可以发现
有关热评信息的请求是一个 POST 请求: http://music.163.com/weapi/v1/resource/comments/R_SO_4_409649818?csrf_token=
409649818 是歌曲ID
且表单数据(params 和 encSecKey 两个字段)与歌曲无关,是一段经过加密的、关于本机Cookie的信息;歌曲ID只出现在URL中,所以只需要抓取一组表单数据,即可用来实现不同歌曲的请求
基本代码如下:
1 package 网易云热评爬取; 2 3 import org.apache.http.HttpEntity; 4 import org.apache.http.NameValuePair; 5 import org.apache.http.client.entity.UrlEncodedFormEntity; 6 import org.apache.http.client.methods.CloseableHttpResponse; 7 import org.apache.http.client.methods.HttpGet; 8 import org.apache.http.client.methods.HttpPost; 9 import org.apache.http.impl.client.CloseableHttpClient; 10 import org.apache.http.impl.client.HttpClients; 11 import org.apache.http.message.BasicNameValuePair; 12 import org.apache.http.util.EntityUtils; 13 import java.util.ArrayList; 14 import java.util.List; 15 import java.util.regex.Matcher; 16 import java.util.regex.Pattern; 17 18 public class MyClawer { 19 public static void printHot(String u) throws Exception{ 20 CloseableHttpClient closeableHttpClient = HttpClients.createDefault() ; 21 HttpPost httpPost = new HttpPost(u) ; 22 httpPost.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"); 23 24 List<NameValuePair> list=new ArrayList<NameValuePair>(); 25 list.add(new BasicNameValuePair("params","RlBC7U1bfy/boPwg9ag7/a7AjkQOgsIfd+vsUjoMY2tyQCPFgnNoxHeCY+ZuHYqtM1zF8DWIBwJWbsCOQ6ZYxBiPE3bk+CI1U6Htoc4P9REBePlaiuzU4M3rDAxtMfNN3y0eimeq3LVo28UoarXs2VMWkCqoTXSi5zgKEKbxB7CmlBJAP9pn1aC+e3+VOTr0")); 26 list.add(new BasicNameValuePair("encSecKey","76a0d8ff9f6914d4f59be6b3e1f5d1fc3998317195464f00ee704149bc6672c587cd4a37471e3a777cb283a971d6b9205ce4a7187e682bdaefc0f225fb9ed1319f612243096823ddec88b6d6ea18f3fec883d2489d5a1d81cb5dbd0602981e7b49db5543b3d9edb48950e113f3627db3ac61cbc71d811889d68ff95d0eba04e9")); 27 28 httpPost.setEntity(new UrlEncodedFormEntity(list)); 29 CloseableHttpResponse response=closeableHttpClient.execute(httpPost); 30 31 HttpEntity entity=response.getEntity(); 32 String ux = EntityUtils.toString(entity,"utf-8") ; 33 //System.out.println(ux); 34 ArrayList<String> s= getBook(ux); 35 36 for(int i=0;i<s.size();i++){ 37 String []arr = s.get(i).split(""") ; 38 
System.out.println(arr[2]); 39 } 40 } 41 public static void main(String[] args) throws Exception { 42 String u = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_409649818?csrf_token=" ; 43 printHot(u); 44 } 45 46 public static ArrayList getBook(String read){ 47 ArrayList<String> arrayList = new ArrayList<String>() ; 48 49 String con = "content(.*?)"}" ; 50 Pattern ah = Pattern.compile(con); 51 Matcher mr = ah.matcher(read); 52 while(mr.find()) { 53 if (!arrayList.contains(mr.group())) { 54 arrayList.add(mr.group()); 55 } 56 } 57 return arrayList ; 58 } 59 }
运行结果: