• curl采集 根据关键词 获取雅虎竞价排名


    之前写过curl批处理采集数据,这里贴上完整版本,代码很简单,废话不说,上代码,新手欢迎指教!!!

    代码只实现到"获取链接"这一步;至于排名,返回数组的键就对应排名(键从 0 开始,键加 1 即为名次)。

      1 <?php
      2 /**
      3  * Based on yahoo access to data
      4  *
      5  * @author chujiu <527891885@qq.com>
      6  * @copyright 2014.04.26 By chujiu
      7  * @version 0.2.1 2014.04.26
      8  */
      9 
     /**
      * Collects result links from Yahoo! Japan search pages for a keyword.
      *
      * All result pages are fetched in parallel through one cURL multi handle,
      * the links inside the `<div id="web">` section of every page are extracted,
      * and a flat list of URL-decoded links is returned in page order.
      * Transfer errors are appended to `get_rank_log.txt` under $path.
      */
     class DataCollectionRank {

         /** Step between two consecutive `b=` start offsets in the request URL. */
         const   PAGE = 10;

         /** Directory that receives the log file; set by exec_curl_get_data(). */
         public  $path = '';

         /** Highest start offset requested (91 => offsets 1, 11, ..., 91). */
         public  $main = 91;

         /**
          * Builds one cURL easy handle per result page and registers them all
          * on a fresh multi handle.
          *
          * @param mixed $keyword Search keyword.
          * @return array|string '' for a blank/non-scalar keyword, otherwise
          *                      array('mh' => multi handle,
          *                            'handle' => array(offset => array('ch' => handle, 'url' => string)))
          */
         private function _gather_data($keyword) {
             // empty() would also reject the legitimate keyword "0".
             if (!is_scalar($keyword) || trim((string) $keyword) === '') {
                 return '';
             }
             // 'handle' is pre-seeded so callers can iterate even when $main < 1.
             $chs = array('handle' => array());
             $mh = curl_multi_init();
             $encoded = urlencode($keyword); // loop-invariant, encode once
             for ($i = 1; $i <= $this->main; $i += self::PAGE) {
                 $url = 'http://search.yahoo.co.jp/search?p='.$encoded.'&tid=top_ga1_sa&ei=UTF-8&aq=-1&oq='.$encoded.'&pstart=1&fr=top_ga1_sa&b='.$i;
                 $ch = curl_init();
                 curl_setopt_array($ch, array(
                     CURLOPT_URL => $url,
                     CURLOPT_HEADER => false,
                     // NOTE(review): peer verification is disabled; irrelevant for the
                     // http:// URL above but unsafe if the URL is ever moved to https.
                     CURLOPT_SSL_VERIFYPEER => false,
                     CURLOPT_RETURNTRANSFER => true,
                     CURLOPT_TIMEOUT => 30,
                     CURLOPT_AUTOREFERER => true
                     )
                 );
                 curl_multi_add_handle($mh, $ch);
                 $chs['handle'][$i]['ch'] = $ch;
                 $chs['handle'][$i]['url'] = $url;
             }
             $chs['mh'] = $mh;
             return $chs;
         }

         /**
          * Runs all page requests in parallel and returns the extracted links.
          *
          * @param mixed  $keyword Search keyword.
          * @param string $path    Directory for get_rank_log.txt.
          * @return array|string '' when the keyword is blank (kept for backward
          *                      compatibility); otherwise a 0-based list of links
          *                      in page order, empty when nothing was found.
          */
         public function exec_curl_get_data($keyword, $path) {
             $this->path = $path;
             $chs = $this->_gather_data($keyword);
             if (empty($chs)) {
                 return '';
             }
             $mh = $chs['mh'];

             // Drive the transfers; curl_multi_select() blocks until there is
             // activity, so the loop no longer spins at 100% CPU.
             $active = null;
             do {
                 do {
                     $status = curl_multi_exec($mh, $active);
                 } while ($status === CURLM_CALL_MULTI_PERFORM);
                 if ($status !== CURLM_OK) {
                     break;
                 }
                 if ($active > 0 && curl_multi_select($mh, 1.0) === -1) {
                     // select() can fail immediately on some platforms; back off briefly.
                     usleep(100000);
                 }
             } while ($active > 0);

             // Drain the completion queue: this is where the per-transfer result
             // code of each easy handle becomes available.
             $results = array();
             while (($info = curl_multi_info_read($mh)) !== false) {
                 foreach ($chs['handle'] as $offset => $entry) {
                     if ($entry['ch'] === $info['handle']) {
                         $results[$offset] = $info['result'];
                         break;
                     }
                 }
             }

             $error = '';
             $links = array();
             foreach ($chs['handle'] as $offset => $entry) {
                 $message = curl_error($entry['ch']);
                 if ($message === '') {
                     if (!isset($results[$offset])) {
                         $message = 'transfer did not complete';
                     } elseif ($results[$offset] !== CURLE_OK) {
                         $message = 'cURL error code '.$results[$offset];
                     }
                 }

                 if ($message !== '') {
                     // Date format fixed from 'Y-d-m' (day and month swapped).
                     $error .= PHP_EOL.'error提示:'.$message.'-------URL:'.$entry['url'].'--------时间:'.date('Y-m-d H:i:s').PHP_EOL;
                 } else {
                     $body = curl_multi_getcontent($entry['ch']);
                     if (!empty($body)) {
                         $links = array_merge($links, $this->_get_keyword_link_preg($body));
                     }
                 }

                 curl_multi_remove_handle($mh, $entry['ch']);
                 curl_close($entry['ch']);
             }
             curl_multi_close($mh);

             if ($error != '') {
                 $this->_writeFile('get_rank_log.txt', $error, 'ab+');
             }
             return $links;
         }

         /**
          * Extracts the links of the organic result section from one page.
          *
          * Keeps the markup between `<div id="web">` and
          * `<div id="posS" class="spns">`, drops everything from the pager
          * (`<div id="pg">`) or related block (`<div id="rel">`) onwards,
          * removes `<em>...</em>` spans, then captures every `href="...">`.
          *
          * @param string $str Raw HTML of one result page.
          * @return array List of URL-decoded links (a list instead of the former
          *               '|'-joined string, so links containing '|' survive).
          */
         private function _get_keyword_link_preg ($str) {
             if (empty($str)) {
                 return array();
             }
             $parts = explode('<div id="web">', $str);
             if (!isset($parts[1])) {
                 // Page without a result section (error page, captcha, layout change).
                 return array();
             }
             $section = explode('<div id="posS" class="spns">', $parts[1]);
             // [\s\S] = any character including newlines; the published snippet
             // had lost the backslashes and read "[sS]".
             $cleaned = preg_replace(
                 array(
                     '#<div id="pg">[\s\S]+#',
                     '#<div id="rel">[\s\S]+#',
                     '#<em>[\s\S]+?</em>#'
                 ),
                 '',
                 $section[0]
             );
             if (!is_string($cleaned)) {
                 // preg_replace() returns null on PCRE failure (e.g. backtrack limit).
                 return array();
             }
             if (!preg_match_all('#href="(.*?)">#', $cleaned, $matches)) {
                 return array();
             }
             return array_map('urldecode', $matches[1]);
         }

         /**
          * Writes $data to $this->path/$fileName.
          *
          * @param string $fileName File name relative to $this->path.
          * @param string $data     Bytes to write.
          * @param string $method   fopen() mode; with "rb+" the file is truncated
          *                         to strlen($data) after writing.
          * @param mixed  $iflock   Truthy: take an exclusive lock while writing (best effort).
          * @param mixed  $check    Truthy: terminate the script with '403 Forbidden!'
          *                         when the target path contains '..'.
          * @param mixed  $chmod    Truthy: chmod the file to 0777 afterwards.
          *                         NOTE(review): world-writable; confirm this is wanted.
          * @return bool True when the data was written, false when the file
          *              could not be opened or written.
          */
         public function _writeFile($fileName, $data, $method="rb+", $iflock=1, $check=1, $chmod=1){
             $file = $this->path.'/'.$fileName;
             if ($check && strpos($file, '..') !== false) {
                 exit('403 Forbidden!');
             }
             @touch($file);
             $handle = @fopen($file, $method);
             if ($handle === false) {
                 // Previously the false handle was passed on to flock/fwrite/fclose.
                 return false;
             }
             if ($iflock) {
                 @flock($handle, LOCK_EX);
             }
             $written = @fwrite($handle, $data);
             if ($written !== false && $method == "rb+") {
                 ftruncate($handle, strlen($data));
             }
             fclose($handle); // also releases the lock
             $chmod && @chmod($file, 0777);
             return $written !== false;
         }
     }
    119 ?>
     /**
      * Removes duplicate rows from a two-dimensional array and maps each
      * remaining row to array('keyword' => ..., 'domain' => ...).
      *
      * Rows are compared by the string form of all their values. Result order
      * and keys follow the original implementation: rows appear in order of
      * first occurrence and are keyed by the 0-based position of their last
      * occurrence in $array.
      *
      * Unlike the former join(",")/explode(",") round-trip, values that contain
      * a comma are kept intact, and rows with fewer than two elements no longer
      * raise an undefined-offset notice (a missing domain becomes null).
      *
      * @param array $array List of rows; element 0 is used as the keyword and
      *                     element 1 as the domain.
      * @return array Map of position => array('keyword' => string, 'domain' => string|null).
      */
     function array_unique_fb($array){
         $lastPosition = array(); // signature => position of last occurrence
         $rowBySignature = array();
         $position = 0;
         foreach ($array as $row) {
             // Cast to strings so 1 and "1" stay equal, as they were after join().
             $row = array_map('strval', array_values((array) $row));
             $signature = serialize($row);
             $lastPosition[$signature] = $position;
             $rowBySignature[$signature] = $row;
             $position++;
         }

         $data = array();
         foreach ($lastPosition as $signature => $key) {
             $row = $rowBySignature[$signature];
             $data[$key]['keyword'] = isset($row[0]) ? $row[0] : '';
             $data[$key]['domain'] = isset($row[1]) ? $row[1] : null;
         }
         return $data;
     }
  • 相关阅读:
    函数
    字符编码转换
    文件读写与修改
    Java期末项目——校园商铺平台(三)
    Java期末项目——校园商铺平台(二)
    Java期末项目——校园商铺平台(一)
    LDAP & Implementation
    RESTful Levels HATEOAS
    隔离级别
    Servlet CDI Analysis
  • 原文地址:https://www.cnblogs.com/chujiuIt/p/3730552.html
Copyright © 2020-2023  润新知