• Java爬虫——网易云热评爬取


    爬取目标网址 :   http://music.163.com/#/song?id=409649818

    需要爬取信息 :   网易云top13热评

    使用之前的 HttpURLConnection 获取网页源码,经过分析发现,源码中并没有热评信息(热评是页面加载后通过单独的网络请求获取的,不在初始 HTML 中)

     1 package bok;
     2 
     3 import java.io.BufferedReader;
     4 import java.io.InputStreamReader;
     5 import java.net.HttpURLConnection;
     6 import java.net.URL;
     7 
     /**
      * Downloads the HTML source of a NetEase Cloud Music song page and prints it.
      *
      * <p>Used to show that the raw page source does not contain the hot comments.
      */
     public class GC {
         /** Address of the song page whose source is fetched. */
         private static final String PAGE_URL = "http://music.163.com/#/song?id=409649818";

         /**
          * Fetches {@link #PAGE_URL} and writes the response body to standard output.
          * Nothing is printed when the server does not answer with HTTP 200.
          *
          * @param args unused
          * @throws Exception if the connection cannot be opened or the body cannot be read
          */
         public static void main(String[] args) throws Exception {
             HttpURLConnection connection = (HttpURLConnection) new URL(PAGE_URL).openConnection();
             try {
                 if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                     return;
                 }
                 // StringBuilder avoids re-copying the whole page on every appended line.
                 StringBuilder page = new StringBuilder();
                 try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                     String line;
                     while ((line = reader.readLine()) != null) {
                         // readLine() strips the terminator, so put a newline back.
                         page.append(line).append("\n");
                     }
                 }
                 System.out.println(page);
             } finally {
                 connection.disconnect();
             }
         }
     }

    部分源码如下:

      1 {/if}
      2 {else}
      3 <span class="u-icn u-icn-75"></span>
      4 {/if}
      5 </div>
      6 </div>
      7 </td>
      8 <td class="">
      9 <div class="f-cb">
     10 <div class="tt">
     11 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}">&nbsp;</span>
     12 <div class="ttc">
     13 <span class="txt">
     14 {var alia=songAlia(x)}
     15 <a href="/song?id=${x.id}"><b title="${x.name|escape}{if alia} - (${alia|escape}){/if}">${soil(x.name)}</b></a>{if alia}<span title="${alia|escape}" class="s-fc8"> - (${soil(alia)})</span>{/if}
     16 {if x.mvid>0}
     17 <span data-res-id="${x.id}" data-res-action="mv" title="播放mv" class="mv">MV</span>
     18 {/if}
     19 </span>
     20 </div>
     21 </div>
     22 </div>
     23 </td>
     24 {/if}
     25 <td class=" s-fc3">
     26 <span class="u-dur {if canDel}candel{/if}">${dur2time(x.duration/1000)}{if x.ftype==2}<i title="歌曲来自第三方网站" class="migu u-icn2 u-icn2-14"></i>{/if}</span>
     27 <div class="opt hshow">
     28 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true"
     29 data-res-type="18"
     30 data-res-id="${x.id}"
     31 data-res-action="addto"
     32 {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a>
     33 <span data-res-id="${x.id}" data-res-type="18" data-res-action="fav" class="icn icn-fav" title="收藏"></span>
     34 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" {if x.album}data-res-pic="${x.album.picUrl}"{/if} class="icn icn-share" title="分享">分享</span>
     35 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载"></span>
     36 {if canDel}
     37 <span data-res-id="${x.id}" data-res-type="18" data-res-action="delete" class="icn icn-del" title="删除">删除</span>
     38 {/if}
     39 </div>
     40 </td>
     41 <td class="">
     42 <div class="text" title="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}">
     43 ${getArtistName(x.artists, '', '', false, false, true)}
     44 </div>
     45 </td>
     46 </tr>
     47 {/list}
     48 </tbody>
     49 </table>
     50 </textarea>
     51 <textarea name="jst" id="m-wgt-song-pgm-list" style="display:none;"><table class="m-table m-table-prog">
     52 <tbody id="song-list">
     53 {list beg..end as y}
     54 {var x=xlist[y]}
     55 <tr id="${x.id|seed}" class="{if y%2!=0}even{/if} {if disable(x)}js-dis{/if}">
     56 <td class="first col1">
     57 <div class="hd">
     58 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}">&nbsp;</span>
     59 <span class="num">${y+1}</span>
     60 </div>
     61 </td>
     62 <td class="col2">
     63 <div class="f-cb">
     64 <div class="tt">
     65 <div class="ttc">
     66 <span class="txt">
     67 {var alia=songAlia(x)}
     68 <a href="/song?id=${x.id}"><b title="${x.name|escape}{if alia} - (${alia|escape}){/if}">${soil(x.name)}</b></a>{if alia}<span title="${alia|escape}" class="s-fc8"> - (${soil(alia)})</span>{/if}
     69 {if x.mvid>0}
     70 <span data-res-id="${x.id}" data-res-action="mv" title="播放mv" class="mv">MV</span>
     71 {/if}
     72 </span>
     73 </div>
     74 </div>
     75 </div>
     76 </td>
     77 <td class="col3 s-fc3">
     78 <span class="u-dur {if canDel}candel{/if}">${dur2time(x.duration/1000)}{if x.ftype==2}<i title="歌曲来自第三方网站" class="migu u-icn2 u-icn2-14"></i>{/if}</span>
     79 <div class="opt hshow">
     80 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true"
     81 data-res-type="18"
     82 data-res-id="${x.id}"
     83 data-res-action="addto"
     84 {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a>
     85 <span data-res-id="${x.id}" data-res-type="18" data-res-action="fav" class="icn icn-fav" title="收藏"></span>
     86 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" {if x.album}data-res-pic="${x.album.picUrl}"{/if} class="icn icn-share" title="分享">分享</span>
     87 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载"></span>
     88 {if canDel}
     89 <span data-res-id="${x.id}" data-res-type="18" data-res-action="delete" class="icn icn-del" title="删除">删除</span>
     90 {/if}
     91 </div>
     92 </td>
     93 <td class="col4">
     94 <div class="text" title="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}">
     95 ${getArtistName(x.artists, '', '', false, false, true)}
     96 </div>
     97 </td>
     98 <td class="col5">
     99 <div class="text">
    100 {if x.album}
    101 <a href="/album?id=${x.album.id}" title="${x.album.name|escape}">${soil(x.album.name)}</a>
    102 {/if}
    103 </div>
    104 </td>
    105 </tr>
    106 {/list}
    107 </tbody>
    108 </table>
    109 </textarea>
    110 <textarea name="jst" id="m-wgt-song-listen" style="display:none;"> <ul>
    111 {list beg..end as y}
    112 {var x=xlist[y]}
    113 {if extData&&extData.limit&&y>=extData.limit}
    114 {break}
    115 {/if}
    116 {var from=getFrom()}
    117 <li id="${x.id|seed}" {if y%2 !=0 }class='even'{/if}>
    118 <div class="hd ">
    119 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}">&nbsp;</span>
    120 <span class="num">${y+1}.</span>
    121 </div>
    122 <div class="song">
    123 <div class="tt">
    124 <div class="ttc">
    125 <span class="txt"><a href="/song?id=${x.id}"><b title="${x.name}">${x.name}</b></a>
    126 <span class='ar s-fc8'> <em>-</em>
    127 ${getArtistName(x.artists, 's-fc8')}
    128 </span>
    129 </span>
    130 </div>
    131 </div>
    132 <div class="opt">
    133 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true" data-res-type="18" data-res-id="${x.id}" data-res-action="addto" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a>
    134 <span data-res-id="${x.id}" data-res-type="18" data-res-action="subscribe" class="icn icn-fav" title="收藏"></span>
    135 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" class="icn icn-share" title="分享">分享</span>
    136 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载">下载</span>
    137 </div>
    138 </div>
    139 <div class="tops">
    140 <span class="bg" style='${x.score*100/x.max}%;'></span>
    141 {if extData.showCount&&x.playCount}<span class="times f-ff2">${x.playCount}次</span>{/if}
    142 </div>
    143 </li>
    144 {/list}
    145 </ul>
    146 {if extData&&extData.limit&&xlist.length>extData.limit}
    147 <div class="more">
    148 <a href="/user/songs/rank?id=${hostId}" >查看更多&gt;</a>
    149 </div>
    150 {/if}
    151 </textarea>
    152 <textarea name="jst" id="m-wgt-purchased-song-list" style="display:none;"> {list beg..end as y}
    153 {var x=xlist[y]}
    154 <tr id="${x.id|seed}" class="{if y%2==1}even{/if} {if disable(x)}js-dis{/if}">
    155 <td class="left">
    156 <div class="hd {if type=='rank'}rank{/if}">
    157 <span data-res-id="${x.id}" data-res-type="18" data-res-action="play" {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if} class="ply {if isPlaying(x)}ply-z-slt{/if}">&nbsp;</span>
    158 <span class="num">${y+1}</span>
    159 {if type=='rank'}
    160 <div class="rk rk-1">
    161 {if x.lastRank>=0}
    162 {if y-x.lastRank>0}
    163 <span class="ico u-icn u-icn-74 s-fc10">${y-x.lastRank}</span>
    164 {elseif y-x.lastRank==0}
    165 <span class="ico u-icn u-icn-72 s-fc4">0</span>
    166 {else}
    167 <span class="ico u-icn u-icn-73 s-fc9">${x.lastRank-y}</span>
    168 {/if}
    169 {else}
    170 <span class="u-icn u-icn-75"></span>
    171 {/if}
    172 </div>
    173 {/if}
    174 </div>
    175 </td>
    176 <td class="u-hasopt">
    177 <div class="f-cb">
    178 <div class="tt">
    179 <div class="ttc">
    180 <span class="txt">
    181 {var alia=songAlia(x)}
    182 <a href="/song?id=${x.id}"><b title="${x.name|escape}{if alia} - (${alia|escape}){/if}">${soil(x.name)}</b></a>{if alia}<span title="${alia|escape}" class="s-fc8"> - (${soil(alia)})</span>{/if}
    183 {if x.mvid>0}
    184 <span data-res-id="${x.id}" data-res-action="mv" title="播放mv" class="mv">MV</span>
    185 {/if}
    186 </span>
    187 </div>
    188 </div>
    189 <div class="opt hshow">
    190 <a class="u-icn u-icn-81 icn-add" href="javascript:;" title="添加到播放列表" hidefocus="true"
    191 data-res-type="18"
    192 data-res-id="${x.id}"
    193 data-res-action="addto"
    194 {if from}data-res-from="${from.fid}" data-res-data="${from.fdata}"{/if}></a>
    195 <span data-res-id="${x.id}" data-res-type="18" data-res-action="fav" class="icn icn-fav" title="收藏"></span>
    196 <span data-res-id="${x.id}" data-res-type="18" data-res-action="share" data-res-name="${x.name}" data-res-author="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}" {if x.album}data-res-pic="${x.album.picUrl}"{/if} class="icn icn-share" title="分享">分享</span>
    197 <span data-res-id="${x.id}" data-res-type="18" data-res-action="download" class="icn icn-dl" title="下载"></span>
    198 {if canDel}
    199 <span data-res-id="${x.id}" data-res-type="18" data-res-action="delete" class="icn icn-del" title="删除">删除</span>
    200 {/if}
    201 </div>
    202 </div>
    203 </td>
    204 <td class="">
    205 <div class="text" title="{list x.artists as art}${art.name}{if art_index<x.artists.length-1}/{/if}{/list}">
    206 ${getArtistName(x.artists, '', '', false, false, true)}
    207 </div>
    208 </td>
    209 <td class="">
    210 <div class="text">
    211 {if x.album}
    212 <a href="/album?id=${x.album.id}" title="${x.album.name|escape}">${soil(x.album.name)}</a>
    213 {/if}
    214 </div>
    215 </td>
    216 <td class="s-fc3">${formatTime(x.paidTime)}</td>
    217 </tr>
    218 {/list}
    219 </textarea>
    220 <textarea name="ntp" id="m-msg-private-send" style="display:none;"><div class="lyct lyct-1 f-cb">
    221 <div class="m-lyshare m-plshare">
    222 <div class="u-err j-flag" style="display: none;">最多选择10位好友</div>
    223 <div class="item item-1 f-cb">
    224 <label>发 给:</label>
    225 <div class="ct f-pr j-flag">
    226 </div>
    227 </div>
    228 <div class="item f-cb">
    229 <label>内 容:</label>
    230 <div class="ct j-flag">
    231 </div>
    232 </div>
    233 </div>
    234 </div>
    235 </textarea>
    236 <textarea name="jst" id="m-wgt-redeem-tip" style="display:none;"><div class="lyct">
    237 <div class="result f-tc">
    238 <div class="text">
    239 <h4 class="f-fs2"><i class="icn u-icn2 u-icn2-{if type=='error'}16{else}15{/if}"></i>${title}</h4>
    240 <p class="f-fs1">${sub}</p>
    241 </div>
    242 <div class="btnwrap {if ok&&cc}btnwrap-1{/if}">
    243 {if ok}
    244 <a data-action="ok" href="javascript:;" class="u-btn2 u-btn2-2 {if ok.length <= 3}u-btn2-w2{/if}" hidefocus="true"><i>${ok}</i></a>
    245 {/if}
    246 {if cc}
    247 <a data-action="cc" href="javascript:;" class="u-btn2 u-btn2-1 u-btn2-w2" hidefocus="true"><i>${cc}</i></a>
    248 {/if}
    249 </div>
    250 </div>
    251 </div>
    252 </textarea>
    253 </div>
    254 <script src="//s3.music.126.net/sep/s/2/core.js?88f5bc0082242aff627ec140af2072b3" type="text/javascript"></script><script src="//s3.music.126.net/sep/s/2/pt_frame_index.js?56ef55a585c894efc0e5d564ccbe4f44" type="text/javascript"></script>
    255 </body>
    256 <script type="text/javascript">
    257 var _gaq = _gaq || [];
    258 _gaq.push(['_setAccount', 'UA-38766552-1'],['_setLocalGifPath', '/UA-38766552-1/__utm.gif'],['_setLocalRemoteServerMode']);
    259 _gaq.push(['_trackPageview']);
    260 (function() {
    261 var ga = document.createElement('script');
    262 ga.type = 'text/javascript';
    263 ga.async = true;
    264 ga.src = '//wr.da.netease.com/ga.js';
    265 var s = document.getElementsByTagName('script')[0];
    266 s.parentNode.insertBefore(ga, s);
    267 })();//fix ipad下的一个bug
    268 if (navigator.userAgent.indexOf('iPad') != -1) {
    269 iframeHeight = Math.max(
    270 Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
    271 Math.max(document.body.offsetHeight, document.documentElement.offsetHeight),
    272 Math.max(document.body.clientHeight, document.documentElement.clientHeight)
    273 );
    274 top.document.body.style.height = iframeHeight + 20 + 'px';
    275 }</script>
    276 </html>
    View Code

    获取的源码中既然没有热评信息

    只能通过 F12 -> Network 面板分析网络请求

    可以发现

    有关热评信息的请求是http://music.163.com/weapi/v1/resource/comments/R_SO_4_409649818?csrf_token=

    409649818 是歌曲ID  

    且表单数据与歌曲无关,是一段关于本机Cookie的信息,所以只需要一种表单数据,即可用来实现不同歌曲的请求

    基本代码如下:

     1 package 网易云热评爬取;
     2 
     3 import org.apache.http.HttpEntity;
     4 import org.apache.http.NameValuePair;
     5 import org.apache.http.client.entity.UrlEncodedFormEntity;
     6 import org.apache.http.client.methods.CloseableHttpResponse;
     7 import org.apache.http.client.methods.HttpGet;
     8 import org.apache.http.client.methods.HttpPost;
     9 import org.apache.http.impl.client.CloseableHttpClient;
    10 import org.apache.http.impl.client.HttpClients;
    11 import org.apache.http.message.BasicNameValuePair;
    12 import org.apache.http.util.EntityUtils;
    13 import java.util.ArrayList;
    14 import java.util.List;
    15 import java.util.regex.Matcher;
    16 import java.util.regex.Pattern;
    17 
    18 public class MyClawer {
    19     public static void printHot(String u) throws Exception{
    20         CloseableHttpClient closeableHttpClient = HttpClients.createDefault() ;
    21         HttpPost httpPost = new HttpPost(u) ;
    22         httpPost.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36");
    23 
    24         List<NameValuePair> list=new ArrayList<NameValuePair>();
    25         list.add(new BasicNameValuePair("params","RlBC7U1bfy/boPwg9ag7/a7AjkQOgsIfd+vsUjoMY2tyQCPFgnNoxHeCY+ZuHYqtM1zF8DWIBwJWbsCOQ6ZYxBiPE3bk+CI1U6Htoc4P9REBePlaiuzU4M3rDAxtMfNN3y0eimeq3LVo28UoarXs2VMWkCqoTXSi5zgKEKbxB7CmlBJAP9pn1aC+e3+VOTr0"));
    26         list.add(new BasicNameValuePair("encSecKey","76a0d8ff9f6914d4f59be6b3e1f5d1fc3998317195464f00ee704149bc6672c587cd4a37471e3a777cb283a971d6b9205ce4a7187e682bdaefc0f225fb9ed1319f612243096823ddec88b6d6ea18f3fec883d2489d5a1d81cb5dbd0602981e7b49db5543b3d9edb48950e113f3627db3ac61cbc71d811889d68ff95d0eba04e9"));
    27 
    28         httpPost.setEntity(new UrlEncodedFormEntity(list));
    29         CloseableHttpResponse response=closeableHttpClient.execute(httpPost);
    30 
    31         HttpEntity entity=response.getEntity();
    32         String ux = EntityUtils.toString(entity,"utf-8") ;
    33         //System.out.println(ux);
    34         ArrayList<String> s= getBook(ux);
    35 
    36         for(int i=0;i<s.size();i++){
    37             String []arr = s.get(i).split(""") ;
    38             System.out.println(arr[2]);
    39         }
    40     }
    41     public static void main(String[] args) throws Exception {
    42         String u = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_409649818?csrf_token=" ;
    43         printHot(u);
    44     }
    45 
    46     public static ArrayList getBook(String read){
    47         ArrayList<String> arrayList = new ArrayList<String>() ;
    48 
    49         String con = "content(.*?)"}" ;
    50         Pattern ah = Pattern.compile(con);
    51         Matcher mr = ah.matcher(read);
    52         while(mr.find()) {
    53             if (!arrayList.contains(mr.group())) {
    54                 arrayList.add(mr.group());
    55             }
    56         }
    57         return  arrayList ;
    58     }
    59 }

    运行结果:

  • 相关阅读:
    ViewState
    Insert
    Copy: 了解SQL Server锁争用:NOLOCK 和 ROWLOCK 的秘密
    How to check number of Active connections in SQL server?
    (转)IIS6上启用Gzip压缩(HTTP压缩) 详解
    Reference (SQL Server 2005自动异机备份)
    [转] ios中KeyChain用途
    ios 中的事件
    unity3d 鼠标事件穿透GUI界面的问题
    Unity StartCoroutine注意的问题
  • 原文地址:https://www.cnblogs.com/LexMoon/p/javaWy.html
Copyright © 2020-2023  润新知