• php爬虫 curl 拼多多 京东评论采集


    PDD评论:需要登录,并在请求头信息中添加 AccessToken(值为登录后获得的令牌):
    $header[] = 'AccessToken:';

    'http://apiv4.yangkeduo.com/reviews/'.$goods_id.'/list?size=10&page='.$page
    JD评论:
    'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId='.$goods_id.'&score='.$score.'&sortType='.$sortType.'&page='.$i.'&pageSize=10&isShadowSku=0&rid=0&fold=1'
    $sortType = 5;//排序
    $score = 3;//0全部,1 差评,2中评,3好评,4带图评论,5追评

    /**
     * Fetch one page of JD product comments and decode it.
     *
     * Requests the hard-coded productPageComments.action URL (product 4995961,
     * page 1) using the headers returned by header() plus a Referer and a
     * desktop Chrome User-Agent. The body is converted to UTF-8 when another
     * encoding is detected, the JSONP callback wrapper is removed, and the
     * remaining JSON is decoded.
     *
     * Side effects: enables error display/reporting and sends a Content-Type
     * response header. Failures raise an E_USER_WARNING instead of throwing.
     *
     * @return array|null Decoded comment payload, or null when the request
     *                    fails or the body is not valid JSON.
     */
    public function spider(){
        ini_set("display_errors", "On"); // show errors while debugging
        error_reporting(E_ALL);          // report every error level
        header("Content-Type: text/html; charset=utf-8");

        $header = $this->header();
        $header[] = 'Referer: https://item.jd.com/4995961.html';

        // Identify as a desktop browser.
        $header[] = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36';

        $url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=4995961&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1';

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): TLS peer/host verification is disabled here, as in the
        // original; enable it wherever a CA bundle is available.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        // To avoid the IP being banned after repeated crawling, a proxy can be used:
        // curl_setopt($ch, CURLOPT_PROXY, '88.198.50.103'); // proxy server address
        // curl_setopt($ch, CURLOPT_PROXYPORT, '8080');      // proxy server port

        $output = curl_exec($ch);
        $curlError = curl_error($ch); // must be read before the handle is closed
        curl_close($ch);

        // curl_exec() returns false on failure; never feed that to the decoders.
        if (!is_string($output) || $output === '') {
            $detail = $curlError !== '' ? ': ' . $curlError : '';
            trigger_error('JD comment request returned no data' . $detail, E_USER_WARNING);
            return null;
        }

        // Convert to UTF-8 only when a different encoding was actually
        // detected; mb_detect_encoding() returns false when nothing matches.
        $encode = mb_detect_encoding($output, array("ASCII", 'UTF-8', "GB2312", "GBK", 'BIG5'));
        if ($encode !== false && $encode !== 'UTF-8') {
            $output = mb_convert_encoding($output, 'UTF-8', $encode);
        }

        // The URL asks for callback=fetchJSON_comment98, so the body arrives as
        // JSONP: fetchJSON_comment98({...}); -- keep only what is between the
        // first "(" and the last ")" before decoding.
        $payload = trim($output);
        if ($payload !== '' && $payload[0] !== '{' && $payload[0] !== '[') {
            $open = strpos($payload, '(');
            $close = strrpos($payload, ')');
            if ($open !== false && $close !== false && $close > $open) {
                $payload = substr($payload, $open + 1, $close - $open - 1);
            }
        }

        $result = json_decode($payload, true);
        if (!is_array($result)) {
            trigger_error('JD comment response is not valid JSON: ' . json_last_error_msg(), E_USER_WARNING);
            return null;
        }

        return $result;
    }
    
    
    //此函数提供了国内的IP地址
        /**
         * Build request headers that all carry one randomly chosen IPv4 address.
         *
         * One of ten hard-coded address ranges is picked at random, a random
         * address inside that range is drawn, and the same address is emitted
         * under the CLIENT-IP, X-FORWARDED-FOR, VIA and REMOTE_ADDR names.
         *
         * @return string[] Header lines in "Name: value" form, usable with CURLOPT_HTTPHEADER.
         */
        public static function header(){
            // Inclusive [first, last] bounds as integer strings for long2ip();
            // the trailing comments give the dotted-quad range of each entry.
            $ranges = [
                ['607649792', '608174079'],     // 36.56.0.0-36.63.255.255
                ['1038614528', '1039007743'],   // 61.232.0.0-61.237.255.255
                ['1783627776', '1784676351'],   // 106.80.0.0-106.95.255.255
                ['2035023872', '2035154943'],   // 121.76.0.0-121.77.255.255
                ['2078801920', '2079064063'],   // 123.232.0.0-123.235.255.255
                ['-1950089216', '-1948778497'], // 139.196.0.0-139.215.255.255
                ['-1425539072', '-1425014785'], // 171.8.0.0-171.15.255.255
                ['-1236271104', '-1235419137'], // 182.80.0.0-182.92.255.255
                ['-770113536', '-768606209'],   // 210.25.0.0-210.47.255.255
                ['-569376768', '-564133889'],   // 222.16.0.0-222.95.255.255
            ];

            // First draw selects the range, second draw selects the address in it.
            [$low, $high] = $ranges[mt_rand(0, 9)];
            $address = long2ip(mt_rand($low, $high));

            // Every header gets the same address value.
            $lines = [];
            foreach (['CLIENT-IP', 'X-FORWARDED-FOR', 'VIA', 'REMOTE_ADDR'] as $name) {
                $lines[] = $name . ': ' . $address;
            }

            return $lines;
        }
    

      

  • 相关阅读:
    Hadoop生态圈-Hive快速入门篇之HQL的基础语法
    Hadoop生态圈-Hive快速入门篇之Hive环境搭建
    Hadoop生态圈-zookeeper的API用法详解
    Hadoop生态圈-zookeeper完全分布式部署
    Hadoop基础-MapReduce的工作原理第一弹
    Hadoop基础-HDFS的读取与写入过程
    java基础-回调函数(callback)
    Hadoop基础-网络拓扑机架感知及其实现
    Hadoop基础-HDFS数据清理过程之校验过程代码分析
    Hadoop基础-Protocol Buffers串行化与反串行化
  • 原文地址:https://www.cnblogs.com/jimz/p/14171483.html
Copyright © 2020-2023  润新知