• 2个爬虫


    <?php
    namespace Utildata;
    use UtildataDbUtil;
    
    /**
     * Crawler for the ifeng.com real-time news list.
     *
     * Workflow:
     *   1. Fetch the news list page.
     *   2. Collect the article links found on it.
     *   3. Fetch every article and extract its title, body and first picture.
     *   4. Store a random sample of the collected articles in the `news` table.
     */
    class Index_m
    {
        /** Picture stored when an article has no usable image of its own. */
        const DEFAULT_IMAGE = 'https://images2018.cnblogs.com/blog/1395514/201805/1395514-20180513105955459-1441660792.jpg';

        /** Maximum number of articles handed to the database per run. */
        const MAX_BATCH = 6;

        /** Maximum number of links read from the list page. */
        const MAX_LINKS = 20;

        /**
         * Runs one full crawl: list page -> article pages -> database.
         *
         * @return array [total processed, inserted, duplicates], as returned by do_sql()
         */
        public function update_m()
        {
            $url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml';
            $listPage = $this->curl($url);
            if (!is_string($listPage) || $listPage === '') {
                // The list page could not be fetched, so there is nothing to crawl.
                return $this->do_sql([]);
            }

            $news = [];
            foreach ($this->geturls($listPage) as $articleUrl) {
                $article = $this->parseArticle($this->curl($articleUrl));
                if ($article !== null) {
                    $news[] = $article;
                }
            }

            return $this->do_sql($news);
        }

        /**
         * Turns the HTML of one article page into a row for the `news` table.
         *
         * @param mixed $html page source as returned by curl(); false/empty means the fetch failed
         * @return array|null the row, or null when the page must be skipped
         */
        private function parseArticle($html)
        {
            if (!is_string($html) || $html === '') {
                return null;
            }

            // Picture galleries carry no article text; they are skipped.
            if (strpos($html, 'picBoxPrev') !== false) {
                echo '图集被删除';
                return null;
            }

            // Title: the <title> text with the trailing "_凤凰..." site suffix removed.
            $title = $this->getKeyWord($html, '<title>', '</title>')[0];
            $suffixAt = strpos($title, '_凤凰');
            if ($suffixAt !== false) {
                $title = substr($title, 0, $suffixAt);
            }

            // Body: two page layouts are handled, each with its own start marker.
            // The body starts as null for every page so that a page matching
            // neither layout is skipped instead of inheriting another page's text.
            $body = null;
            foreach (['<!--mainContent begin-->', '<!-- 正文begin -->'] as $marker) {
                if (strpos($html, $marker) !== false) {
                    $body = $this->getKeyWord($html, $marker, '<span class="ifengLogo"><a')[0];
                    break;
                }
            }
            if ($body === null) {
                return null;
            }

            // Drop stubs. Both thresholds are byte lengths, not character counts.
            if (strlen($body) < 100 || strlen($title) < 20) {
                return null;
            }

            return [
                'news_title'  => trim($title),
                'news_body'   => trim($body),
                'news_pic'    => trim($this->extractImage($body)),
                // NOTE(review): "autuor" is presumably the column name in the
                // `news` table; confirm before correcting the spelling.
                'news_autuor' => '新闻网',
            ];
        }

        /**
         * Returns the URL of the first <img> in the article body.
         *
         * Falls back to DEFAULT_IMAGE when the body has no image, the src is
         * empty, or the src contains CJK characters.
         *
         * @param string $body article body HTML
         * @return string image URL
         */
        private function extractImage($body)
        {
            // Reading up to the closing quote yields exactly the src value,
            // whatever other attributes follow it in the tag.
            $src = $this->getKeyWord($body, '<img src="', '"')[0];
            if ($src === '' || preg_match('/[\x{4e00}-\x{9fa5}]/u', $src) === 1) {
                return self::DEFAULT_IMAGE;
            }

            return $src;
        }

        /**
         * Writes a random sample of at most MAX_BATCH articles to the `news` table.
         *
         * Articles whose title already exists in the table are counted as
         * duplicates and not inserted. The title of every non-duplicate
         * article is printed.
         *
         * @param array $news rows with the keys news_title, news_body, news_pic, news_autuor
         * @return array [total processed, inserted, duplicates]
         */
        public function do_sql($news)
        {
            // Total processed, inserted, already present.
            $sum = 0;
            $succ = 0;
            $ready = 0;

            if (!is_array($news) || count($news) === 0) {
                return array($sum, $succ, $ready);
            }

            // array_rand() fails when asked for more entries than exist, and
            // returns a bare key instead of an array when asked for one.
            $picked = (array) array_rand($news, min(self::MAX_BATCH, count($news)));

            foreach ($picked as $key) {
                $row = $news[$key];
                $sum++;

                // De-duplicate by title.
                $existing = DbUtil::getdb()->table('news')
                    ->where(array('news_title' => $row['news_title']))
                    ->count();
                if ($existing > 0) {
                    $ready++;
                    continue;
                }

                if (DbUtil::getdb()->table('news')->insert($row)) {
                    $succ++;
                }
                print_r($row['news_title']);
            }

            return array($sum, $succ, $ready);
        }

        /**
         * Extracts the article links from the list page.
         *
         * Only the section between the "即时新闻" heading and the following
         * clear-div is scanned, and at most MAX_LINKS links are returned.
         *
         * @param string $curl HTML of the list page
         * @return array list of article URLs (possibly empty)
         */
        public function geturls($curl)
        {
            $urls = [];

            $section = $this->getKeyWord($curl, '<h2>即时新闻</h2>', '<div class="clear"></div>');
            if ($section[1] === false) {
                return $urls;
            }

            $html = $section[0];
            $offset = 0;
            for ($i = 0; $i < self::MAX_LINKS; $i++) {
                $hit = $this->getKeyWord($html, '<a href="', '" target="_blank">', $offset);
                if ($hit[1] === false) {
                    // No further links; a false offset would restart the scan at 0.
                    break;
                }

                $link = trim($hit[0]);
                if ($link !== '') {
                    $urls[] = $link;
                }
                // Continue searching after the link that was just read.
                $offset = $hit[1];
            }

            return $urls;
        }

        /**
         * Returns the text between two markers (scraping helper).
         *
         * @param string $info      text to search, e.g. a web page
         * @param string $first_key start marker
         * @param string $last_key  end marker, searched for after the start marker
         * @param int    $index     offset at which the search for $first_key begins (optional)
         * @return array [trimmed text between the markers, offset of $last_key];
         *               ['', false] when either marker is not found
         */
        public function getKeyWord($info, $first_key, $last_key, $index = 0)
        {
            $info = (string) $info;
            $index = (int) $index;
            $notFound = array('', false);

            // An offset beyond the string makes strpos() fail outright.
            if (abs($index) > strlen($info)) {
                return $notFound;
            }

            $firstAt = strpos($info, $first_key, $index);
            if ($firstAt === false) {
                return $notFound;
            }

            $contentAt = $firstAt + strlen($first_key);
            $lastAt = strpos($info, $last_key, $contentAt);
            if ($lastAt === false) {
                return $notFound;
            }

            // The end offset is returned so that callers can loop over repeated matches.
            return array(trim(substr($info, $contentAt, $lastAt - $contentAt)), $lastAt);
        }

        /**
         * Fetches a URL with cURL and returns the response body.
         *
         * @param string $url    address to fetch
         * @param string $coding encoding of the page; anything other than 'utf-8' is converted to UTF-8
         * @return string|false response body, or false when the request failed
         */
        public function curl($url, $coding = 'utf-8')
        {
            $ch = curl_init();
            if ($ch === false) {
                return false;
            }

            curl_setopt($ch, CURLOPT_URL, $url);
            // Do not include the response headers in the output.
            curl_setopt($ch, CURLOPT_HEADER, 0);
            // Return the response as a string instead of printing it.
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

            $result = curl_exec($ch);
            curl_close($ch);

            if ($result === false) {
                return false;
            }

            // Pages that are not UTF-8 are converted; unconvertible bytes are dropped.
            if ($coding !== 'utf-8') {
                $result = iconv($coding, 'utf-8//IGNORE', $result);
            }

            return $result;
        }
    }
    ?>
  • 相关阅读:
    第13组_16通信3班_045_OSPFv3作业
    RIPng配置(第十三组)
    基于IPV6的数据包分析(更新拓扑加入了linux主机和抓取133icmp包)(第十三组)
    vmware vsphere powercli 因为在此系统中禁止执行脚本
    vmware virtual machine must be running in order to be migrated
    flashback transaction闪回事务查询
    oracle 闪回功能详解
    linux下修改/dev/shm tmpfs文件系统大小
    vmware虚拟机guest系统重启后获得169.254.X.X的ip解决方法
    一键部署 PPTP server
  • 原文地址:https://www.cnblogs.com/cl94/p/9032322.html
Copyright © 2020-2023  润新知