• curl采集初历


    2天的采集学习  记录下

    目标网站 www.cupshe.com

    需求分析  采集目标站点商品主图,名称,价格,以及采集时间

    先建立cupshe的库

    目标站点全部商品的网址  www.cupshe.com/collections/all?page=1&sort_by=best-selling        //sort_by=best-selling的意思是按照销量排序

    分析：列表页中每个商品 a 标签的 href 形如 /products/<商品标识>，把其中的商品标识拼到 https://www.cupshe.com/products/ 后面，就得到商品详情页的地址。

    那我们就先采集href中的商品详情地址

    先建立表

    表名product_urls

    -- Product handles (the path segment that follows /products/) collected
    -- from the listing pages; one row per handle.
    CREATE TABLE IF NOT EXISTS `product_urls` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      -- Every row exists only to carry a handle, so NULL is not allowed.
      `url` varchar(355) NOT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;

    <?php 
    
        /**
         * Fetch a URL with cURL and return the response body.
         *
         * @param string $url Absolute URL to request.
         * @return string|false Response body, or false when the transfer fails
         *                      (the return value of curl_exec()).
         */
        function geturl($url){
            $curl = curl_init();
            curl_setopt($curl, CURLOPT_URL, $url);
            // Hand the body back from curl_exec() instead of printing it.
            curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
            // NOTE(review): certificate and host-name checks are switched off
            // as a workaround for local SSL errors; re-enable them (and
            // configure a CA bundle) before using this outside a sandbox.
            curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
            // Bound the request so a stalled connection cannot hang the crawl.
            curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($curl, CURLOPT_TIMEOUT, 30);

            $res = curl_exec($curl);
            if ($res === false) {
                // Surface the reason; callers still receive false as before.
                error_log("geturl: request to {$url} failed: " . curl_error($curl));
            }

            curl_close($curl);

            return $res;
        }
        // Crawl the best-selling listing pages and store every product handle
        // (the text after /products/ in each link) in the product_urls table.
        $conn = new mysqli("localhost","root","","cupshe");

        // `new mysqli` always yields an object, so the failure signal is
        // connect_error rather than the truthiness of $conn.
        if ($conn->connect_error) {
            die("Connection failed: " . $conn->connect_error);
        }

        // Scraped text is bound as a parameter so quotes in it cannot break
        // or alter the statement.
        $stmt = $conn->prepare("INSERT INTO product_urls (`url`) VALUES (?)");
        if (!$stmt) {
            die("Prepare failed: " . $conn->error);
        }

        $base_url = "https://www.cupshe.com";
        $page_max = 35;
        // Handles already stored during this run; a product can appear on
        // more than one listing page.
        $seen = array();

        for ($i = 1; $i <= $page_max; $i++) {
            // A plain "&" separates query parameters; "&amp;" is only valid
            // inside HTML markup.
            $url = $base_url . "/collections/all?page={$i}&sort_by=best-selling";
            $html = geturl($url);
            if ($html === false || $html === '') {
                echo "page {$i}: fetch failed, skipped" . PHP_EOL;
                continue;
            }

            // "#" is used as the delimiter because the pattern contains "/".
            if (!preg_match_all('#href="/products/(.+?)"#i', $html, $m)) {
                echo "page {$i}: no product links found" . PHP_EOL;
                continue;
            }

            foreach (array_unique($m[1]) as $handle) {
                if (isset($seen[$handle])) {
                    continue;
                }
                $seen[$handle] = true;

                $stmt->bind_param('s', $handle);
                if (!$stmt->execute()) {
                    echo "insert failed for {$handle}: " . $stmt->error . PHP_EOL;
                }
            }
        }

        $stmt->close();
        mysqli_close($conn);
        
    ?>


    存储在数据库中然后再新建一个php文件  去采集商品详情页面的html文档 并且用正则匹配
    先建表 看需求
    -- One row per scraped product detail page.
    CREATE TABLE IF NOT EXISTS `product_variants` (
      `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
      `name` text,
      -- DECIMAL stores prices exactly; FLOAT is approximate.
      `price` decimal(10,2) DEFAULT NULL,
      `image_src` text,
      -- Kept as text because the loader inserts a preformatted date string.
      -- TODO(review): switch to DATETIME once the loader writes 'Y-m-d H:i:s'.
      `updated_at` varchar(255) DEFAULT NULL,
      `create_time` varchar(255) DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    <?php
    date_default_timezone_set('PRC'); 
    /**
     * Fetch a URL with cURL and return the response body.
     *
     * @param string $url Absolute URL to request.
     * @return string|false Response body, or false when the transfer fails
     *                      (the return value of curl_exec()).
     */
    function geturl($url){
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        // Hand the body back from curl_exec() instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
        // NOTE(review): certificate and host-name checks are switched off as
        // a workaround for local SSL errors; re-enable them (and configure a
        // CA bundle) before using this outside a sandbox.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
        // Bound the request so a stalled connection cannot hang the crawl.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);

        $res = curl_exec($curl);
        if ($res === false) {
            // Surface the reason; callers still receive false as before.
            error_log("geturl: request to {$url} failed: " . curl_error($curl));
        }

        curl_close($curl);

        return $res;
    }
    // Read every stored product handle, download its detail page and save
    // the name, price, main image URL and capture time to product_variants.
    $conn = new mysqli("localhost","root","","cupshe");

    // `new mysqli` always yields an object, so the failure signal is
    // connect_error rather than the truthiness of $conn.
    if ($conn->connect_error) {
        die("Connection failed: " . $conn->connect_error);
    }

    $result = mysqli_query($conn, "SELECT url FROM product_urls");
    if (!$result) {
        die("Query failed: " . $conn->error);
    }
    $rows = mysqli_fetch_all($result, MYSQLI_ASSOC);

    // Scraped text is bound as parameters so an apostrophe in a product
    // name cannot break or alter the statement.
    $stmt = $conn->prepare("INSERT INTO product_variants (name,price,image_src,create_time) VALUES (?,?,?,?)");
    if (!$stmt) {
        die("Prepare failed: " . $conn->error);
    }

    foreach ($rows as $row) {
        $product_url = "https://www.cupshe.com/products/" . $row['url'];
        $html = geturl($product_url);
        if ($html === false || $html === '') {
            echo "fetch failed, skipped: {$product_url}" . PHP_EOL;
            continue;
        }

        // Markup the patterns target (samples from the site):
        //   <span class="last-crumb">Cupshe  Amuse Society Halter Bikini Set</span>
        //   <span class="money" id="ProductPrice" itemprop="price">21.80</span>
        // "#" is the delimiter because the patterns contain "/".
        // NOTE(review): the image pattern is reconstructed on the assumption
        // that the published snippet lost its backslashes: first
        // cdn.shopify.com src that contains no "js" and ends in a digit.
        $has_image = preg_match('#src="(//cdn\.shopify\.com/(?:(?!js)[^"])*\d)"#i', $html, $images);
        $has_name  = preg_match('#<span class="last-crumb">(.*?)</span>#is', $html, $names);
        $has_price = preg_match('#<span class="money" id="ProductPrice" itemprop="price">(.*?)</span>#is', $html, $prices);

        // A row without a name or price is useless; skip instead of
        // inserting blanks.
        if (!$has_name || !$has_price) {
            echo "name/price not found, skipped: {$product_url}" . PHP_EOL;
            continue;
        }

        $name  = trim($names[1]);
        $price = trim($prices[1]);
        // The capture group already excludes the src=" prefix and the
        // closing quote, so no trimming of the match is needed.
        $image = $has_image ? $images[1] : '';
        $time  = date("Y-m-d H:i:s");

        $stmt->bind_param('ssss', $name, $price, $image, $time);
        if (!$stmt->execute()) {
            echo "insert failed for {$product_url}: " . $stmt->error . PHP_EOL;
        }
    }

    $stmt->close();
    mysqli_close($conn);
    
    
    ?>

    完成之后，又开始写多线程版本（上面这段代码是单线程的）。

    多线程不详解   因为目前我也是半懂半不懂的

    参考地址http://www.cnblogs.com/loveyouyou616/p/5624139.html

    附上代码

    <?php
    // Concurrent variant: product pages are downloaded in batches through a
    // curl_multi handle, then parsed and stored one by one.
    $start = microtime(true);

    header('Content-type:text/html;charset=utf-8');
    // Same zone as the single-request script so create_time values agree.
    date_default_timezone_set('PRC');

    // Connect to the database.
    $conn = new mysqli("localhost","root","","cupshe");

    // `new mysqli` always yields an object, so the failure signal is
    // connect_error rather than the truthiness of $conn.
    if ($conn->connect_error) {
        die("Connection failed: " . $conn->connect_error);
    }

    $result = mysqli_query($conn, "SELECT url FROM product_urls");
    if (!$result) {
        die("Query failed: " . $conn->error);
    }

    // Build the full detail-page URL for every stored handle.
    $url_arr = array();
    foreach (mysqli_fetch_all($result, MYSQLI_ASSOC) as $val) {
        $url_arr[] = "https://www.cupshe.com/products/" . $val['url'];
    }

    // Scraped text is bound as parameters so an apostrophe in a product
    // name cannot break or alter the statement.
    $stmt = $conn->prepare("INSERT INTO product_variants (name,price,image_src,create_time) VALUES (?,?,?,?)");
    if (!$stmt) {
        die("Prepare failed: " . $conn->error);
    }

    // Number of pages in flight at once; kept small to limit open sockets
    // and load on the target site.
    $batch_size = 10;
    $mh = curl_multi_init();

    foreach (array_chunk($url_arr, $batch_size) as $batch) {
        $handles = array();
        foreach ($batch as $i => $url) {
            $curl = curl_init();
            curl_setopt($curl, CURLOPT_URL, $url);
            curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
            // NOTE(review): certificate and host-name checks are switched
            // off as a workaround for local SSL errors; re-enable them
            // before using this outside a sandbox.
            curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($curl, CURLOPT_TIMEOUT, 30);

            curl_multi_add_handle($mh, $curl);
            $handles[$i] = $curl;
        }

        // Drive all transfers of this batch until none is still running.
        do {
            $status = curl_multi_exec($mh, $active);
            if ($active) {
                // Wait for socket activity instead of spinning.
                curl_multi_select($mh, 1.0);
            }
        } while ($active && $status == CURLM_OK);

        foreach ($handles as $i => $curl) {
            $url  = $batch[$i];
            $html = curl_multi_getcontent($curl);
            curl_multi_remove_handle($mh, $curl);
            curl_close($curl);

            if ($html === null || $html === '') {
                echo "fetch failed, skipped: {$url}<br/>";
                continue;
            }

            // "#" is the delimiter because the patterns contain "/".
            // NOTE(review): the image pattern is reconstructed on the
            // assumption that the published snippet lost its backslashes:
            // first cdn.shopify.com src with no "js" that ends in a digit.
            $has_image = preg_match('#src="(//cdn\.shopify\.com/(?:(?!js)[^"])*\d)"#i', $html, $images);
            $has_name  = preg_match('#<span class="last-crumb">(.*?)</span>#is', $html, $names);
            $has_price = preg_match('#<span class="money" id="ProductPrice" itemprop="price">(.*?)</span>#is', $html, $prices);

            // A row without a name or price is useless; skip instead of
            // inserting blanks.
            if (!$has_name || !$has_price) {
                echo "name/price not found, skipped: {$url}<br/>";
                continue;
            }

            $name  = trim($names[1]);
            $price = trim($prices[1]);
            $image = $has_image ? $images[1] : '';
            $time  = date("Y-m-d H:i:s");

            $stmt->bind_param('ssss', $name, $price, $image, $time);
            $ok = $stmt->execute();
            var_dump($ok);
        }
    }

    curl_multi_close($mh);
    $stmt->close();
    mysqli_close($conn);

    $end = microtime(true) - $start;

    echo '<br/>';
    // The author measured about 19.0 s on average with the earlier
    // one-request-at-a-time loop.
    echo $end;
    ?>

    再说下：不知道为什么这台机器上 PDO 用不了，只能用 mysqli 了。第一次用，可能比较菜吧，慢慢学习吧，学习使我快乐。（注：当时的连接代码写的是 new PDOException(...)，而创建连接应该用 new PDO(...)，这很可能就是 PDO“用不了”的原因。）

  • 相关阅读:
    【总结】数组去重的3种方式
    【原】运动版的轮播图,有左右按钮和单独分页,原生JS版
    【巩固】JS中的封闭空间
    IIS 配置.svc的MIME映射
    c# 线程定时器 System.Threading.Timer 转载
    QTcreator快捷操作,转载
    C#中的钩子说明
    anaconda更新库命令
    Chart控件,chart、Series、ChartArea曲线图绘制的重要属性介绍
    c# chart控件柱状图,改变柱子宽度
  • 原文地址:https://www.cnblogs.com/kimc1112/p/7156493.html
Copyright © 2020-2023  润新知