• 使用CURL和火车头软件采集搜狐文章


    直接上代码:

    /**
     * Perform an HTTP request with cURL and return the response.
     *
     * A GET request is sent unless $post is non-empty, in which case a POST is sent.
     * Redirects are followed, a fixed browser User-Agent and a Baidu referer are
     * sent, and the transfer is limited to 10 seconds.
     *
     * @param string       $url          URL to request.
     * @param array|string $post         POST payload. An array/object is URL-encoded with
     *                                   http_build_query(); a string is sent as-is.
     *                                   Leave empty for a GET request.
     * @param string       $cookie       Value of the Cookie request header (e.g. "a=1; b=2").
     * @param int|bool     $returnCookie When truthy, response headers are captured and the
     *                                   first Set-Cookie value is returned alongside the body.
     *
     * @return string|array The response body; or, when $returnCookie is truthy, an array
     *                      with keys 'cookie' (first cookie as "name=value", '' if none)
     *                      and 'content' (the body). On a transport failure the cURL
     *                      error message is returned as a string.
     */
    function curl_request($url, $post = '', $cookie = '', $returnCookie = 0)
    {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($curl, CURLOPT_REFERER, "http://www.baidu.com/");
        if ($post) {
            curl_setopt($curl, CURLOPT_POST, true);
            // http_build_query() only accepts arrays/objects; a pre-encoded string is passed through.
            curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
        }
        if ($cookie) {
            curl_setopt($curl, CURLOPT_COOKIE, $cookie);
        }
        // Headers are only included in the output when the caller wants the cookie back.
        curl_setopt($curl, CURLOPT_HEADER, (bool) $returnCookie);
        curl_setopt($curl, CURLOPT_TIMEOUT, 10);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);

        $data = curl_exec($curl);
        if ($data === false || curl_errno($curl)) {
            // Read the message before closing so the handle is released on the error path too.
            $error = curl_error($curl);
            curl_close($curl);
            return $error;
        }

        // Total size of all header blocks (including those of followed redirects).
        $headerSize = (int) curl_getinfo($curl, CURLINFO_HEADER_SIZE);
        curl_close($curl);

        if (!$returnCookie) {
            return $data;
        }

        // Split on the size reported by cURL rather than searching for a blank line,
        // so redirect responses do not leave stray headers in the body.
        $header = (string) substr($data, 0, $headerSize);
        $body = (string) substr($data, $headerSize);

        // Header names are case-insensitive (HTTP/2 sends them lowercase), and a cookie
        // without attributes has no trailing ';', so match up to ';' or end of line.
        $cookieValue = '';
        if (preg_match('/^Set-Cookie:\s*([^;\r\n]*)/mi', $header, $matches)) {
            $cookieValue = trim($matches[1]);
        }

        return array(
            'cookie' => $cookieValue,
            'content' => $body,
        );
    }
    
    // Collect article metadata from a Sohu MP profile feed, page by page, and
    // dump the list of article URLs.
    $caiji_set = [];   // article records, grouped by the item's position within its page
    $caiji_url = '';   // all article URLs, separated by '<br/>'

    // Number of list pages to request; presumably the page count of the feed
    // below at the time of writing (original note: "page-85") -- verify before reuse.
    $page_count = 85;

    for ($i = 0; $i < $page_count; $i++) {
        // Alternative feed (original note: "page-100"):
        // http://mp.sohu.com/apiV2/profile/newsListAjax?xpt=NTYzOTU5NjY1OUBzaW5hLnNvaHUuY29t&pageNumber=<page>&pageSize=10&categoryId=&_=1541053659128
        $url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt=cHBhZzU5MTM5NjA2NmVlM0Bzb2h1LmNvbQ==&pageNumber=".$i."&pageSize=10&categoryId=&_=1541122188390";

        // No POST payload: curl_request() issues a GET when its second argument is empty.
        $response = curl_request($url);

        // The original code decodes twice, which implies the endpoint wraps its JSON
        // document in a JSON string. Handle that case and a plain JSON document as well.
        $result = json_decode($response, true);
        if (is_string($result)) {
            $result = json_decode(stripslashes(html_entity_decode($result)), true);
        }

        // Skip pages that failed to download or did not decode to the expected shape.
        if (!is_array($result) || !isset($result['msg'], $result['data']) || !is_array($result['data'])) {
            continue;
        }

        // NOTE(review): 'succes' is kept exactly as in the original; confirm the
        // API really spells its status this way.
        if ($result['msg'] == 'succes') {
            foreach ($result['data'] as $k => $item) {
                $article_url = "http:".$item["url"];   // the API returns protocol-relative URLs
                $caiji_set[$k][] = array(
                    "brief" => urldecode($item['brief']),
                    "thumbnail" => $item["thumbnail"],
                    "title" => urldecode($item['title']),
                    "url" => $article_url,
                );
                $caiji_url .= $article_url.'<br/>';
            }
        }
    }

    // To persist the list instead of printing it, append $caiji_url to a file,
    // e.g. with file_put_contents('./gougou.txt', $caiji_url, FILE_APPEND).
    var_export($caiji_url);
    exit;
    
  • 相关阅读:
    Winform跨线程操作界面的策略
    Winform DataGridView扩展
    GDI+的常用类
    函数中参数的验证顺序
    C# Winform常见的Editor及其它经验
    能够引起异常的运算符和关键字
    终结程序
    C#异常处理策略
    python3.6入门到高阶(全栈) day01 python 基础
    虚拟机中系统盘扩容
  • 原文地址:https://www.cnblogs.com/feixiablog/p/9894573.html
Copyright © 2020-2023  润新知