两天的采集学习，记录一下。
目标网站 www.cupshe.com
需求分析 采集目标站点商品主图,名称,价格,以及采集时间
先建立cupshe的库
目标站点全部商品的网址 www.cupshe.com/collections/all?page=1&sort_by=best-selling //sort_by=best-selling的意思是按照销量排序
分析：列表页中商品 a 标签的 href 形如 /products/商品标识，把其中的商品标识取出来，再拼接到 https://www.cupshe.com/products/ 后面，就得到商品详情页的地址。
那我们就先采集 href 中的商品标识（即商品详情地址的后半部分）。
先建立表
表名product_urls
-- Product links scraped from the collection listing pages.
-- Each row stores the part of a product link that follows "/products/".
-- No AUTO_INCREMENT seed is given so that a fresh table starts numbering at 1.
CREATE TABLE IF NOT EXISTS `product_urls` (
  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
  `url` VARCHAR(355) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
<?php
/**
 * Step 1 of the scraper: walk the "all products" listing pages (sorted by
 * best-selling), pull the product link suffix out of every
 * href="/products/..." attribute and store it in the `product_urls` table.
 */

/**
 * Download a URL with cURL and return the response body.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function geturl($url)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    // NOTE(review): certificate and host-name verification are switched off,
    // as in the original ("use when SSL errors occur"). This leaves the
    // transfer open to interception; prefer fixing the CA bundle instead.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    $res = curl_exec($curl);
    curl_close($curl);
    return $res;
}

// Report mysqli failures through return values so the checks below work
// the same way on every PHP version.
mysqli_report(MYSQLI_REPORT_OFF);

// mysqli is used for the database access. The abandoned PDO attempt in the
// original instantiated PDOException instead of PDO, so it never produced
// a connection.
$conn = new mysqli("localhost", "root", "", "cupshe");
// `new mysqli` always yields an object, so test connect_error rather than !$conn.
if ($conn->connect_error) {
    die("Connection failed: " . $conn->connect_error);
}

// Prepared statement: scraped text is never spliced into the SQL string.
$stmt = $conn->prepare("INSERT INTO product_urls (`url`) VALUES (?)");
if (!$stmt) {
    die("Prepare failed: " . $conn->error);
}
$slug = '';
$stmt->bind_param('s', $slug);

$page_max = 35;   // number of listing pages to walk
$seen = array();  // link suffixes already stored, across all pages

for ($i = 1; $i <= $page_max; $i++) {
    $url = "https://www.cupshe.com/collections/all?page={$i}&sort_by=best-selling";
    $html = geturl($url);
    if ($html === false) {
        echo "page {$i}: download failed\n";
        continue;
    }

    // '#' is the delimiter because the pattern itself contains '/'.
    preg_match_all('#href="/products/(.+?)"#i', $html, $m);
    $arr_products = array_values(array_unique($m[1]));

    $stored = 0;
    foreach ($arr_products as $v) {
        if (isset($seen[$v])) {
            continue;
        }
        $seen[$v] = true;
        $slug = $v; // bound by reference to the prepared statement
        if ($stmt->execute()) {
            $stored++;
        } else {
            echo "page {$i}: insert failed for {$v}: " . $stmt->error . "\n";
        }
    }
    echo "page {$i}: stored {$stored} product links\n";
}

$stmt->close();
mysqli_close($conn);
?>
商品地址存入数据库之后，再新建一个 PHP 文件，去采集商品详情页面的 HTML 文档，并用正则匹配出需要的字段（名称、价格、主图）。
先根据需求建表：
-- One row per scraped product detail page.
-- No AUTO_INCREMENT seed is given so that a fresh table starts numbering at 1.
CREATE TABLE IF NOT EXISTS `product_variants` (
  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
  `name` TEXT,
  -- Exact decimal type: FLOAT cannot store prices such as 19.99 exactly.
  `price` DECIMAL(10,2) DEFAULT NULL,
  `image_src` TEXT,
  `updated_at` VARCHAR(255) DEFAULT NULL,
  -- Kept as text because the scraper writes a PHP-formatted date string here.
  `create_time` VARCHAR(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
<?php
/**
 * Step 2 of the scraper (single-threaded): for every link stored in
 * `product_urls`, download the product detail page, extract the main image,
 * the name and the price with regular expressions, and insert one row into
 * `product_variants` together with the time of collection.
 */
date_default_timezone_set('PRC');

/**
 * Download a URL with cURL and return the response body.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function geturl($url)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    // NOTE(review): certificate and host-name verification are switched off,
    // as in the original ("use when SSL errors occur"). This leaves the
    // transfer open to interception; prefer fixing the CA bundle instead.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    $res = curl_exec($curl);
    curl_close($curl);
    return $res;
}

// Report mysqli failures through return values so the checks below work
// the same way on every PHP version.
mysqli_report(MYSQLI_REPORT_OFF);

// Connect to the database.
$conn = new mysqli("localhost", "root", "", "cupshe");
// `new mysqli` always yields an object, so test connect_error rather than !$conn.
if ($conn->connect_error) {
    die("Connection failed: " . $conn->connect_error);
}

$rows = $conn->query("SELECT url FROM product_urls");
if (!$rows) {
    die("Query failed: " . $conn->error);
}

// Prepared statement: product names may contain quotes (e.g. an apostrophe),
// which would break or subvert a string-built INSERT.
$insert = $conn->prepare("insert into product_variants (name,price,image_src,create_time) values (?,?,?,?)");
if (!$insert) {
    die("Prepare failed: " . $conn->error);
}
$name = $price = $image = $time = '';
$insert->bind_param('ssss', $name, $price, $image, $time);

while ($row = $rows->fetch_assoc()) {
    $product_url = "https://www.cupshe.com/products/" . $row['url'];
    $html = geturl($product_url);
    if ($html === false) {
        echo "download failed: {$product_url}\n";
        continue;
    }

    // Markup the patterns below are written against:
    //   <span class="last-crumb">Cupshe Amuse Society Halter Bikini Set</span>
    //   <span class="money" id="ProductPrice" itemprop="price">21.80</span>
    // '#' is the delimiter because every pattern contains '/'.
    // The image pattern is a reconstruction of the original, whose backslashes
    // were lost: a cdn.shopify.com src that contains no "js" and ends in a
    // digit. The URL is captured directly instead of being trimmed afterwards.
    $has_image = preg_match('#src="(//cdn\.shopify\.com/(?:(?!js)[^"])*\d)"#i', $html, $images);
    $has_name  = preg_match('#<span class="last-crumb">(.*?)</span>#is', $html, $names);
    $has_price = preg_match('#<span class="money" id="ProductPrice" itemprop="price">(.*?)</span>#is', $html, $prices);

    // Skip pages where any field is missing rather than inserting blanks.
    if (!$has_image || !$has_name || !$has_price) {
        echo "fields not found, skipped: {$product_url}\n";
        continue;
    }

    $name  = trim($names[1]);
    $price = trim($prices[1]);
    $image = $images[1];
    $time  = date("Y-m-d H:i:s"); // collection time, with the date/time separator

    if (!$insert->execute()) {
        echo "insert failed for {$product_url}: " . $insert->error . "\n";
    }
}

$rows->free();
$insert->close();
mysqli_close($conn);
?>
以上是单线程的版本。完成之后，又开始尝试写多线程（并发请求）的版本。
多线程部分不详解，因为目前我也是半懂不懂的。
参考地址http://www.cnblogs.com/loveyouyou616/p/5624139.html
附上代码
<?php
/**
 * Step 2 of the scraper, concurrent version: product detail pages are
 * downloaded in parallel batches through the curl_multi API, then parsed and
 * stored in `product_variants`.
 *
 * The original script created a curl_multi handle but still fetched every
 * page one at a time with curl_exec(); its author noted an average run time
 * of 19.002983093262 s for that for-loop baseline.
 */
$start = microtime(true);
header('Content-type:text/html;charset=utf-8');
date_default_timezone_set('PRC');

/**
 * Build a cURL handle configured for one product page.
 *
 * @param string $url Absolute product page URL.
 * @return resource|CurlHandle Configured, not yet executed handle.
 */
function new_product_handle($url)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
    // NOTE(review): certificate and host-name verification are switched off,
    // which is what the original effectively did: its strpos($url,'https')
    // branch never ran because strpos() returns 0 for these URLs. This leaves
    // the transfer open to interception; prefer fixing the CA bundle instead.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    return $curl;
}

/**
 * Extract name, price and main image URL from a product page.
 *
 * @param string $html Page source.
 * @return array|null ['name' => ..., 'price' => ..., 'image' => ...], or null
 *                    when any of the three fields cannot be found.
 */
function extract_product($html)
{
    // '#' is the delimiter because every pattern contains '/'.
    // The image pattern is a reconstruction of the original, whose backslashes
    // were lost: a cdn.shopify.com src that contains no "js" and ends in a digit.
    $ok = preg_match('#src="(//cdn\.shopify\.com/(?:(?!js)[^"])*\d)"#i', $html, $images)
        && preg_match('#<span class="last-crumb">(.*?)</span>#is', $html, $names)
        && preg_match('#<span class="money" id="ProductPrice" itemprop="price">(.*?)</span>#is', $html, $prices);
    if (!$ok) {
        return null;
    }
    return array(
        'name'  => trim($names[1]),
        'price' => trim($prices[1]),
        'image' => $images[1],
    );
}

// Report mysqli failures through return values so the checks below work
// the same way on every PHP version.
mysqli_report(MYSQLI_REPORT_OFF);

// Connect to the database.
$conn = new mysqli("localhost", "root", "", "cupshe");
// `new mysqli` always yields an object, so test connect_error rather than !$conn.
if ($conn->connect_error) {
    die("Connection failed: " . $conn->connect_error);
}

$rows = $conn->query("SELECT url FROM product_urls");
if (!$rows) {
    die("Query failed: " . $conn->error);
}

// Build the full product URLs from the stored link suffixes.
$url_arr = array();
while ($row = $rows->fetch_assoc()) {
    $url_arr[] = "https://www.cupshe.com/products/" . $row['url'];
}
$rows->free();

// Prepared statement: product names may contain quotes.
$insert = $conn->prepare("insert into product_variants (name,price,image_src,create_time) values (?,?,?,?)");
if (!$insert) {
    die("Prepare failed: " . $conn->error);
}
$name = $price = $image = $time = '';
$insert->bind_param('ssss', $name, $price, $image, $time);

$batch_size = 10; // simultaneous downloads per batch
$stored = 0;
$skipped = 0;

foreach (array_chunk($url_arr, $batch_size) as $batch) {
    $mh = curl_multi_init();
    $handles = array();
    foreach ($batch as $url) {
        $curl = new_product_handle($url);
        curl_multi_add_handle($mh, $curl);
        $handles[] = $curl;
    }

    // Drive all transfers of this batch until none is still running.
    $running = 0;
    do {
        $status = curl_multi_exec($mh, $running);
        if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
            // select() could not wait on the sockets; back off briefly
            // instead of spinning.
            usleep(100000);
        }
    } while ($running > 0 && $status === CURLM_OK);

    foreach ($handles as $curl) {
        $html = (string) curl_multi_getcontent($curl);
        curl_multi_remove_handle($mh, $curl);
        curl_close($curl);

        // A failed download yields an empty body, which simply fails to parse.
        $product = extract_product($html);
        if ($product === null) {
            $skipped++;
            continue;
        }

        $name  = $product['name'];
        $price = $product['price'];
        $image = $product['image'];
        $time  = date("Y-m-d H:i:s"); // collection time, with the date/time separator
        if ($insert->execute()) {
            $stored++;
        } else {
            $skipped++;
        }
    }
    curl_multi_close($mh);
}

$insert->close();
mysqli_close($conn);

echo "stored: {$stored}, skipped: {$skipped}";
$end = microtime(true) - $start;
echo '<br/>';
echo $end;
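再说一下：不知道为什么这台机器上 PDO 用不了，只能用 mysqli 了。（注：创建 PDO 连接应写 new PDO(...)；如果误写成 new PDOException(...)，得到的是异常对象而不是数据库连接，看起来就像 PDO“用不了”，可以检查一下是否是这个原因。）第一次用，可能比较菜吧，慢慢学习吧，学习使我快乐。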