[php]代码库
<?php
// +----------------------------------------------------------------------
// | ThinkPHP [ WE CAN DO IT JUST THINK IT ]
// +----------------------------------------------------------------------
// | Copyright (c) 2009 http://thinkphp.cn All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: liu21st <liu21st@gmail.com>
// +----------------------------------------------------------------------
/**
 * HTTP utility class.
 * Provides a set of static helpers for fetching remote resources, sending
 * file downloads to the browser and working with HTTP headers / status lines.
 * @category ORG
 * @package ORG
 * @subpackage Net
 * @author liu21st <liu21st@gmail.com>
 */
class Http {

    /**
     * Fetch a remote file with cURL and store it in a local file.
     * @access public
     * @param string $remote URL of the remote file
     * @param string $local  path of the local file to write (truncated if it exists)
     * @return bool true when the transfer completed, false when the local file
     *              or the cURL handle could not be opened or the transfer failed
     */
    static public function curlDownload($remote, $local) {
        $fp = fopen($local, "w");
        if (!$fp) {
            return false;
        }
        $cp = curl_init($remote);
        if (!$cp) {
            fclose($fp);
            return false;
        }
        curl_setopt($cp, CURLOPT_FILE, $fp);
        curl_setopt($cp, CURLOPT_HEADER, 0);
        $ok = curl_exec($cp);
        curl_close($cp);
        fclose($fp);
        return $ok !== false;
    }

    /**
     * Fetch a remote resource over HTTP using fsockopen.
     * Useful when the cURL extension is not available on the host.
     * @static
     * @access public
     * @param string $url  remote URL (http or https)
     * @param array  $conf optional settings; only the keys below are read:
     *                     int    limit   maximum number of body bytes to read (0 = no limit)
     *                     mixed  post    POST payload, either a key=value&... string or an array
     *                     string cookie  value sent in the Cookie header
     *                     string ip      when set, connect to this address instead of the URL host
     *                     int    timeout connect / read timeout in seconds
     *                     bool   block   whether the stream is blocking (default true)
     * @return string the response body, or '' when $conf is not an array, the
     *                connection fails or the request times out
     */
    static public function fsockopenDownload($url, $conf = array()) {
        $return = '';
        if (!is_array($conf)) {
            return $return;
        }

        $matches = parse_url($url);
        if (!is_array($matches)) {
            $matches = array();
        }
        $scheme = isset($matches['scheme']) ? strtolower($matches['scheme']) : 'http';
        $secure = ($scheme === 'https');
        $host   = isset($matches['host']) ? $matches['host'] : '';
        // Keep the query string even when the URL has no path component.
        $path   = (isset($matches['path']) && $matches['path'] !== '') ? $matches['path'] : '/';
        if (isset($matches['query']) && $matches['query'] !== '') {
            $path .= '?' . $matches['query'];
        }
        $port = !empty($matches['port']) ? $matches['port'] : ($secure ? 443 : 80);

        // Read only the documented keys; importing arbitrary keys as local
        // variables would let a caller overwrite $host, $path, $out, ...
        $conf = array_merge(array(
            'limit'   => 0,
            'post'    => '',
            'cookie'  => '',
            'ip'      => '',
            'timeout' => 15,
            'block'   => TRUE,
        ), $conf);
        $limit   = (int) $conf['limit'];
        if ($limit < 0) {
            $limit = 0;
        }
        $post    = $conf['post'];
        $cookie  = $conf['cookie'];
        $ip      = $conf['ip'];
        $timeout = $conf['timeout'];
        $block   = (bool) $conf['block'];

        // HTTP_USER_AGENT is not set when running from the command line.
        $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

        $isPost = (bool) $post;
        if ($isPost && is_array($post)) {
            $post = http_build_query($post);
        }

        // Header lines are joined with CRLF and terminated by an empty line.
        $lines = array(
            ($isPost ? 'POST' : 'GET') . " $path HTTP/1.0",
            'Accept: */*',
            'Accept-Language: zh-cn',
        );
        if ($isPost) {
            $lines[] = 'Content-Type: application/x-www-form-urlencoded';
        }
        $lines[] = "User-Agent: $userAgent";
        $lines[] = "Host: $host";
        if ($isPost) {
            $lines[] = 'Content-Length: ' . strlen($post);
        }
        $lines[] = 'Connection: Close';
        if ($isPost) {
            $lines[] = 'Cache-Control: no-cache';
        }
        $lines[] = "Cookie: $cookie";
        $out = implode("\r\n", $lines) . "\r\n\r\n" . ($isPost ? $post : '');

        $target = $ip ? $ip : $host;
        if ($target === '') {
            return '';
        }
        if ($secure) {
            $target = 'ssl://' . $target;
        }
        // Best effort by design: connection errors are suppressed and reported as ''.
        $fp = @fsockopen($target, $port, $errno, $errstr, $timeout);
        if (!$fp) {
            return '';
        }

        stream_set_blocking($fp, $block);
        stream_set_timeout($fp, $timeout);
        @fwrite($fp, $out);
        $status = stream_get_meta_data($fp);
        if (!$status['timed_out']) {
            $timedOut = false;
            // Skip the response headers: they end at the first empty line.
            while (!feof($fp)) {
                $line = @fgets($fp);
                if ($line === "\r\n" || $line === "\n") {
                    break;
                }
                // feof() stays false after a read timeout, so stop explicitly.
                if ($line === false && self::streamTimedOut($fp)) {
                    $timedOut = true;
                    break;
                }
            }
            $stop = false;
            while (!$timedOut && !$stop && !feof($fp)) {
                $data = fread($fp, ($limit == 0 || $limit > 8192) ? 8192 : $limit);
                if ($data === false) {
                    break;
                }
                $return .= $data;
                if ($limit) {
                    $limit -= strlen($data);
                    $stop = $limit <= 0;
                }
                if ($data === '' && self::streamTimedOut($fp)) {
                    $timedOut = true;
                }
            }
        }
        @fclose($fp);
        return $return;
    }

    /**
     * Tell whether the last read on a stream hit the stream timeout.
     * @param resource $fp open stream
     * @return bool
     */
    static private function streamTimedOut($fp) {
        $meta = stream_get_meta_data($fp);
        return !empty($meta['timed_out']);
    }

    /**
     * Send a file (or a string) to the browser as a download.
     * Emits the download headers, writes the payload and terminates the script.
     * When $content is given it is what gets sent; otherwise the file named by
     * $filename (looked up as given, then under UPLOAD_PATH) is streamed.
     * @static
     * @access public
     * @param string  $filename file to download
     * @param string  $showname file name shown to the user (defaults to $filename)
     * @param string  $content  content to send instead of a file
     * @param integer $expire   browser cache lifetime in seconds
     * @return void
     */
    static public function download($filename, $showname = '', $content = '', $expire = 180) {
        $isFile = false;
        if (is_file($filename)) {
            $isFile = true;
        } elseif (defined('UPLOAD_PATH') && is_file(UPLOAD_PATH . $filename)) {
            $filename = UPLOAD_PATH . $filename;
            $isFile   = true;
        } elseif ($content == '') {
            throw_exception($filename . L('下载文件不存在!'));
        }

        // Content-Length must describe what is actually written below.
        $length = ($content != '') ? strlen($content) : filesize($filename);

        if (empty($showname)) {
            $showname = $filename;
        }
        $showname = basename($showname);

        // Only inspect files that exist; anything else is sent as a generic binary.
        $type = $isFile ? mime_content_type($filename) : false;
        if (!$type) {
            $type = "application/octet-stream";
        }

        // Quote the file name so names containing spaces or separators survive.
        $safeName = str_replace(array("\r", "\n"), '', $showname);
        $safeName = str_replace(array('\\', '"'), array('\\\\', '\\"'), $safeName);

        // Send the HTTP headers and start the download.
        header("Pragma: public");
        header("Cache-control: max-age=" . $expire);
        header("Expires: " . gmdate("D, d M Y H:i:s", time() + $expire) . " GMT");
        header("Last-Modified: " . gmdate("D, d M Y H:i:s", time()) . " GMT");
        header('Content-Disposition: attachment; filename="' . $safeName . '"');
        header("Content-Length: " . $length);
        header("Content-type: " . $type);
        header('Content-Encoding: none');
        header("Content-Transfer-Encoding: binary");
        if ($content == '') {
            readfile($filename);
        } else {
            echo($content);
        }
        exit();
    }

    /**
     * Collect the request headers of the current request.
     * Uses getallheaders() when the SAPI provides it, otherwise rebuilds the
     * header names from the HTTP_* entries of $_SERVER.
     * @return array header name => value
     */
    static private function requestHeaders() {
        if (function_exists('getallheaders')) {
            $headers = getallheaders();
            return is_array($headers) ? $headers : array();
        }
        $headers = array();
        foreach ($_SERVER as $key => $value) {
            if (strncmp($key, 'HTTP_', 5) === 0) {
                // HTTP_ACCEPT_LANGUAGE -> Accept-Language
                $name = str_replace(' ', '-', ucwords(strtolower(str_replace('_', ' ', substr($key, 5)))));
                $headers[$name] = $value;
            }
        }
        return $headers;
    }

    /**
     * Show request header information as "Name:value" lines.
     * @param string $header name of a single header to show; empty for all headers
     * @param bool   $echo   true to print the result (line breaks converted with
     *                       nl2br), false to return it
     * @return string|null the text when $echo is false, otherwise nothing
     */
    static function getHeaderInfo($header = '', $echo = true) {
        $headers = self::requestHeaders();
        $output  = '';
        if (!empty($header)) {
            // An absent header is reported with an empty value.
            $info    = isset($headers[$header]) ? $headers[$header] : '';
            $output .= $header . ':' . $info . "\n";
        } else {
            foreach ($headers as $key => $val) {
                $output .= "$key:$val\n";
            }
        }
        if ($echo) {
            echo(nl2br($output));
        } else {
            return $output;
        }
    }

    /**
     * Send the HTTP status line for a status code.
     * Codes that are not in the table below are ignored.
     * @param int $code HTTP status code
     * @return void
     */
    static function sendHttpStatus($code) {
        static $_status = array(
            // Informational 1xx
            100 => 'Continue',
            101 => 'Switching Protocols',
            // Success 2xx
            200 => 'OK',
            201 => 'Created',
            202 => 'Accepted',
            203 => 'Non-Authoritative Information',
            204 => 'No Content',
            205 => 'Reset Content',
            206 => 'Partial Content',
            // Redirection 3xx
            300 => 'Multiple Choices',
            301 => 'Moved Permanently',
            302 => 'Found', // 1.1
            303 => 'See Other',
            304 => 'Not Modified',
            305 => 'Use Proxy',
            // 306 is deprecated but reserved
            307 => 'Temporary Redirect',
            // Client Error 4xx
            400 => 'Bad Request',
            401 => 'Unauthorized',
            402 => 'Payment Required',
            403 => 'Forbidden',
            404 => 'Not Found',
            405 => 'Method Not Allowed',
            406 => 'Not Acceptable',
            407 => 'Proxy Authentication Required',
            408 => 'Request Timeout',
            409 => 'Conflict',
            410 => 'Gone',
            411 => 'Length Required',
            412 => 'Precondition Failed',
            413 => 'Request Entity Too Large',
            414 => 'Request-URI Too Long',
            415 => 'Unsupported Media Type',
            416 => 'Requested Range Not Satisfiable',
            417 => 'Expectation Failed',
            // Server Error 5xx
            500 => 'Internal Server Error',
            501 => 'Not Implemented',
            502 => 'Bad Gateway',
            503 => 'Service Unavailable',
            504 => 'Gateway Timeout',
            505 => 'HTTP Version Not Supported',
            509 => 'Bandwidth Limit Exceeded'
        );
        if (isset($_status[$code])) {
            header('HTTP/1.1 ' . $code . ' ' . $_status[$code]);
            // CGI-style header so the status is also honoured under FastCGI.
            header('Status:' . $code . ' ' . $_status[$code]);
        }
    }
} // end of class Http
if (!function_exists('mime_content_type')) {
    /**
     * Fallback for mime_content_type(), defined only when the built-in
     * function is missing. The type is chosen purely from the file name
     * extension (text after the last '.', case-insensitive); the file itself
     * is never opened.
     * @param string $filename file name or path
     * @return string MIME type, 'application/octet-stream' for unknown extensions
     */
    function mime_content_type($filename) {
        static $contentType = array(
            'ai'      => 'application/postscript',
            'aif'     => 'audio/x-aiff',
            'aifc'    => 'audio/x-aiff',
            'aiff'    => 'audio/x-aiff',
            'asc'     => 'application/pgp', //changed by skwashd - was text/plain
            'asf'     => 'video/x-ms-asf',
            'asx'     => 'video/x-ms-asf',
            'au'      => 'audio/basic',
            'avi'     => 'video/x-msvideo',
            'bcpio'   => 'application/x-bcpio',
            'bin'     => 'application/octet-stream',
            'bmp'     => 'image/bmp',
            'c'       => 'text/plain', // or 'text/x-csrc', //added by skwashd
            'cc'      => 'text/plain', // or 'text/x-c++src', //added by skwashd
            'cs'      => 'text/plain', //added by skwashd - for C# src
            'cpp'     => 'text/x-c++src', //added by skwashd
            'cxx'     => 'text/x-c++src', //added by skwashd
            'cdf'     => 'application/x-netcdf',
            'class'   => 'application/octet-stream', //secure but application/java-class is correct
            'com'     => 'application/octet-stream', //added by skwashd
            'cpio'    => 'application/x-cpio',
            'cpt'     => 'application/mac-compactpro',
            'csh'     => 'application/x-csh',
            'css'     => 'text/css',
            'csv'     => 'text/comma-separated-values', //added by skwashd
            'dcr'     => 'application/x-director',
            'diff'    => 'text/diff',
            'dir'     => 'application/x-director',
            'dll'     => 'application/octet-stream',
            'dms'     => 'application/octet-stream',
            'doc'     => 'application/msword',
            'dot'     => 'application/msword', //added by skwashd
            'dvi'     => 'application/x-dvi',
            'dxr'     => 'application/x-director',
            'eps'     => 'application/postscript',
            'etx'     => 'text/x-setext',
            'exe'     => 'application/octet-stream',
            'ez'      => 'application/andrew-inset',
            'gif'     => 'image/gif',
            'gtar'    => 'application/x-gtar',
            'gz'      => 'application/x-gzip',
            'h'       => 'text/plain', // or 'text/x-chdr',//added by skwashd
            'h++'     => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hh'      => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hpp'     => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hxx'     => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hdf'     => 'application/x-hdf',
            'hqx'     => 'application/mac-binhex40',
            'htm'     => 'text/html',
            'html'    => 'text/html',
            'ice'     => 'x-conference/x-cooltalk',
            'ics'     => 'text/calendar',
            'ief'     => 'image/ief',
            'ifb'     => 'text/calendar',
            'iges'    => 'model/iges',
            'igs'     => 'model/iges',
            'jar'     => 'application/x-jar', //added by skwashd - alternative mime type
            'java'    => 'text/x-java-source', //added by skwashd
            'jpe'     => 'image/jpeg',
            'jpeg'    => 'image/jpeg',
            'jpg'     => 'image/jpeg',
            'js'      => 'application/x-javascript',
            'kar'     => 'audio/midi',
            'latex'   => 'application/x-latex',
            'lha'     => 'application/octet-stream',
            'log'     => 'text/plain',
            'lzh'     => 'application/octet-stream',
            'm3u'     => 'audio/x-mpegurl',
            'man'     => 'application/x-troff-man',
            'me'      => 'application/x-troff-me',
            'mesh'    => 'model/mesh',
            'mid'     => 'audio/midi',
            'midi'    => 'audio/midi',
            'mif'     => 'application/vnd.mif',
            'mov'     => 'video/quicktime',
            'movie'   => 'video/x-sgi-movie',
            'mp2'     => 'audio/mpeg',
            'mp3'     => 'audio/mpeg',
            'mpe'     => 'video/mpeg',
            'mpeg'    => 'video/mpeg',
            'mpg'     => 'video/mpeg',
            'mpga'    => 'audio/mpeg',
            'ms'      => 'application/x-troff-ms',
            'msh'     => 'model/mesh',
            'mxu'     => 'video/vnd.mpegurl',
            'nc'      => 'application/x-netcdf',
            'oda'     => 'application/oda',
            'patch'   => 'text/diff',
            'pbm'     => 'image/x-portable-bitmap',
            'pdb'     => 'chemical/x-pdb',
            'pdf'     => 'application/pdf',
            'pgm'     => 'image/x-portable-graymap',
            'pgn'     => 'application/x-chess-pgn',
            'pgp'     => 'application/pgp', //added by skwashd
            'php'     => 'application/x-httpd-php',
            'php3'    => 'application/x-httpd-php3',
            'pl'      => 'application/x-perl',
            'pm'      => 'application/x-perl',
            'png'     => 'image/png',
            'pnm'     => 'image/x-portable-anymap',
            'po'      => 'text/plain',
            'ppm'     => 'image/x-portable-pixmap',
            'ppt'     => 'application/vnd.ms-powerpoint',
            'ps'      => 'application/postscript',
            'qt'      => 'video/quicktime',
            'ra'      => 'audio/x-realaudio',
            'rar'     => 'application/octet-stream',
            'ram'     => 'audio/x-pn-realaudio',
            'ras'     => 'image/x-cmu-raster',
            'rgb'     => 'image/x-rgb',
            'rm'      => 'audio/x-pn-realaudio',
            'roff'    => 'application/x-troff',
            'rpm'     => 'audio/x-pn-realaudio-plugin',
            'rtf'     => 'text/rtf',
            'rtx'     => 'text/richtext',
            'sgm'     => 'text/sgml',
            'sgml'    => 'text/sgml',
            'sh'      => 'application/x-sh',
            'shar'    => 'application/x-shar',
            'shtml'   => 'text/html',
            'silo'    => 'model/mesh',
            'sit'     => 'application/x-stuffit',
            'skd'     => 'application/x-koan',
            'skm'     => 'application/x-koan',
            'skp'     => 'application/x-koan',
            'skt'     => 'application/x-koan',
            'smi'     => 'application/smil',
            'smil'    => 'application/smil',
            'snd'     => 'audio/basic',
            'so'      => 'application/octet-stream',
            'spl'     => 'application/x-futuresplash',
            'src'     => 'application/x-wais-source',
            'stc'     => 'application/vnd.sun.xml.calc.template',
            'std'     => 'application/vnd.sun.xml.draw.template',
            'sti'     => 'application/vnd.sun.xml.impress.template',
            'stw'     => 'application/vnd.sun.xml.writer.template',
            'sv4cpio' => 'application/x-sv4cpio',
            'sv4crc'  => 'application/x-sv4crc',
            'swf'     => 'application/x-shockwave-flash',
            'sxc'     => 'application/vnd.sun.xml.calc',
            'sxd'     => 'application/vnd.sun.xml.draw',
            'sxg'     => 'application/vnd.sun.xml.writer.global',
            'sxi'     => 'application/vnd.sun.xml.impress',
            'sxm'     => 'application/vnd.sun.xml.math',
            'sxw'     => 'application/vnd.sun.xml.writer',
            't'       => 'application/x-troff',
            'tar'     => 'application/x-tar',
            'tcl'     => 'application/x-tcl',
            'tex'     => 'application/x-tex',
            'texi'    => 'application/x-texinfo',
            'texinfo' => 'application/x-texinfo',
            'tgz'     => 'application/x-gtar',
            'tif'     => 'image/tiff',
            'tiff'    => 'image/tiff',
            'tr'      => 'application/x-troff',
            'tsv'     => 'text/tab-separated-values',
            'txt'     => 'text/plain',
            'ustar'   => 'application/x-ustar',
            'vbs'     => 'text/plain', //added by skwashd - for obvious reasons
            'vcd'     => 'application/x-cdlink',
            'vcf'     => 'text/x-vcard',
            'vcs'     => 'text/calendar',
            'vfb'     => 'text/calendar',
            'vrml'    => 'model/vrml',
            'vsd'     => 'application/vnd.visio',
            'wav'     => 'audio/x-wav',
            'wax'     => 'audio/x-ms-wax',
            'wbmp'    => 'image/vnd.wap.wbmp',
            'wbxml'   => 'application/vnd.wap.wbxml',
            'wm'      => 'video/x-ms-wm',
            'wma'     => 'audio/x-ms-wma',
            'wmd'     => 'application/x-ms-wmd',
            'wml'     => 'text/vnd.wap.wml',
            'wmlc'    => 'application/vnd.wap.wmlc',
            'wmls'    => 'text/vnd.wap.wmlscript',
            'wmlsc'   => 'application/vnd.wap.wmlscriptc',
            'wmv'     => 'video/x-ms-wmv',
            'wmx'     => 'video/x-ms-wmx',
            'wmz'     => 'application/x-ms-wmz',
            'wrl'     => 'model/vrml',
            'wvx'     => 'video/x-ms-wvx',
            'xbm'     => 'image/x-xbitmap',
            'xht'     => 'application/xhtml+xml',
            'xhtml'   => 'application/xhtml+xml',
            'xls'     => 'application/vnd.ms-excel',
            'xlt'     => 'application/vnd.ms-excel',
            'xml'     => 'application/xml',
            'xpm'     => 'image/x-xpixmap',
            'xsl'     => 'text/xml',
            'xwd'     => 'image/x-xwindowdump',
            'xyz'     => 'chemical/x-xyz',
            'z'       => 'application/x-compress',
            'zip'     => 'application/zip',
        );
        // strrchr() yields false when the name contains no dot at all.
        $suffix = strrchr($filename, '.');
        $type   = ($suffix === false) ? '' : strtolower((string) substr($suffix, 1));
        if (isset($contentType[$type])) {
            return $contentType[$type];
        }
        return 'application/octet-stream';
    }
}
if (!function_exists('image_type_to_extension')) {
    /**
     * Fallback for image_type_to_extension(), defined only when the built-in
     * function is missing. Maps an IMAGETYPE_* constant to a file extension.
     * NOTE(review): some mappings here ('.jpg', '.jpf', '.aiff') may differ
     * from what the built-in returns; they are kept unchanged - confirm before
     * relying on exact parity.
     * @param int  $imagetype   one of the IMAGETYPE_* constants
     * @param bool $include_dot whether the returned extension starts with a dot
     *                          (default true, matching the built-in signature)
     * @return string|false the extension, or false for an empty or unknown type
     */
    function image_type_to_extension($imagetype, $include_dot = true) {
        if (empty($imagetype)) {
            return false;
        }
        switch ($imagetype) {
            case IMAGETYPE_GIF     : $extension = '.gif';  break;
            case IMAGETYPE_JPEG    : $extension = '.jpg';  break;
            case IMAGETYPE_PNG     : $extension = '.png';  break;
            case IMAGETYPE_SWF     : $extension = '.swf';  break;
            case IMAGETYPE_PSD     : $extension = '.psd';  break;
            case IMAGETYPE_BMP     : $extension = '.bmp';  break;
            case IMAGETYPE_TIFF_II : $extension = '.tiff'; break;
            case IMAGETYPE_TIFF_MM : $extension = '.tiff'; break;
            case IMAGETYPE_JPC     : $extension = '.jpc';  break;
            case IMAGETYPE_JP2     : $extension = '.jp2';  break;
            case IMAGETYPE_JPX     : $extension = '.jpf';  break;
            case IMAGETYPE_JB2     : $extension = '.jb2';  break;
            case IMAGETYPE_SWC     : $extension = '.swc';  break;
            case IMAGETYPE_IFF     : $extension = '.aiff'; break;
            case IMAGETYPE_WBMP    : $extension = '.wbmp'; break;
            case IMAGETYPE_XBM     : $extension = '.xbm';  break;
            default                : return false;
        }
        return $include_dot ? $extension : substr($extension, 1);
    }
}
用jQuery方式筛选采集内容:相信很多大牛都知道这个类库,可自学出身的我还是找了N久,phpQuery、Snoopy等一遍一遍尝试,最后才在无意中找到PHP Simple HTML DOM,更让人惊喜的是又找到了中文手册。
一个人学习,漫长而又艰辛,真希望有时候能得到指点,不至于让时间白白地流失。
基础代码:获取网页建议用cURL,附加POST数据可以登录后采集。
<?php
// Example: fetch a page with cURL and list the links found in its left column.
require_once('./simple_html_dom.php');

$url = 'http://www.w3cschool.cc/';
$Curl = curl_init();                              // create the cURL handle
curl_setopt($Curl, CURLOPT_URL, $url);            // target URL
curl_setopt($Curl, CURLOPT_RETURNTRANSFER, 1);    // return the response instead of printing it
curl_setopt($Curl, CURLOPT_HEADER, 0);            // keep response headers out of the text handed to the parser
$result = curl_exec($Curl);                       // run the request
curl_close($Curl);                                // release the handle

if ($result === false || $result === '') {
    exit('Failed to fetch ' . $url);
}

$html = str_get_html($result);                    // build the DOM
if (!$html) {
    exit('Failed to parse ' . $url);
}

foreach ($html->find('#leftcolumn a') as $element) {
    echo $element->href . '<br>';                 // link URL
    echo $element->plaintext . '<br>';            // link text
}

$html->clear();
unset($html);
中文手册(作者: S.C. Chen):
http://www.ecartchina.com/php-simple-html-dom/index.htm
采集淘宝测试
// Example: collect the categories of a Taobao shop, then the goods of each category.
require_once('simple_html_dom.php');
set_time_limit(0);                      // "time_limit" is not an ini directive; lift the execution limit explicitly
ini_set("memory_limit","512M");
$memory=memory_get_usage();
echo 'memory:'.($memory/1024).'KB<br/>';
echo 'time:'.date('H:i:s',time()).'<br/>';

/**
 * Fetch a URL with cURL.
 * @param string $url
 * @return string|false the response body, or false when the request failed
 */
function curl_get_content($url){
    $Curl=curl_init();                              // create the cURL handle
    curl_setopt($Curl, CURLOPT_URL, $url);          // target URL
    curl_setopt($Curl, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    curl_setopt($Curl, CURLOPT_HEADER, 0);          // do not include response headers
    $result=curl_exec($Curl);                       // run the request
    curl_close($Curl);                              // release the handle
    return $result;
}

$cateUrl='http://the-seventh-sense.taobao.com/';
$cateCon=curl_get_content($cateUrl);
$cateHtml=$cateCon ? str_get_html($cateCon) : false;   // build the DOM only when something was fetched
$CateList=array();
if ($cateHtml) {
    foreach($cateHtml->find('.J_TAllCatsTree li .fst-cat-hd a[href*=category]') as $element) {
        $CateList[]=array(
            'url'  => urldecode($element->href),    // category URL
            'name' => $element->plaintext,          // category name
        );
    }
    $cateHtml->clear();
}
unset($cateHtml);

$goodsList=array();                     // stays a countable array even when nothing is scraped
foreach ($CateList as $goodsUrl) {
    $goodsCon=curl_get_content($goodsUrl['url']);
    $goodsHtml=$goodsCon ? str_get_html($goodsCon) : false;
    if (!$goodsHtml) {
        continue;                       // skip categories that could not be fetched or parsed
    }
    foreach($goodsHtml->find('.shop-hesper-bd .item') as $goodsElement) {
        // A selector may match nothing for a given item; record '' in that case.
        $nameNode=$goodsElement->find(".detail .item-name",0);
        $priceNode=$goodsElement->find(".detail .c-price",0);
        $imgNode=$goodsElement->find(".photo a img",0);
        $goodsList[]=array(
            'name'     => $nameNode ? $nameNode->plaintext : '',
            'price'    => $priceNode ? $priceNode->plaintext : '',
            'img'      => $imgNode ? $imgNode->src : '',
            'catename' => $goodsUrl['name'],
        );
    }
    $goodsHtml->clear();
    unset($goodsHtml);
}

echo '<hr/>';

$n1=count($CateList);
$n2=count($goodsList);
echo '采集'.$n1.'条栏目'.$n2.'个商品<br/>';

$memory=memory_get_usage();
echo 'memory:'.($memory/1024).'KB<br/>';
echo 'time:'.date('H:i:s',time()).'<br/>';
beginmemory:971.953125KB
begintime:05:30:19
overmemory:1352.890625KB
overtime:05:30:39
耗时20s,成功采集9个栏目127个商品
phpQuery是一个基于PHP的服务端开源项目,它可以让PHP开发人员轻松处理DOM文档内容,比如获取某新闻网站的头条信息。更有意思的是,它采用了jQuery的思想,你可以像使用jQuery一样处理页面内容,获取你想要的页面信息。
采集头条
先看一实例,现在我要采集新浪网国内新闻的头条,代码如下:
// Load the phpQuery core, read the target page, then print the first <h1> inside .blkTop.
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('http://news.sina.com.cn/china');
$headline = pq(".blkTop h1:eq(0)");
echo $headline->html();
简单的三行代码,就可以获取头条内容。首先在程序中包含phpQuery.php核心程序,然后调用读取目标网页,最后输出对应标签下的内容。
pq()是一个功能强大的方法,跟jQuery的$()如出一辙,jQuery的选择器基本上都能使用在phpQuery上,只要把“.”变成“->”。如上例中,pq(".blkTop h1:eq(0)")抓取了页面class属性为blkTop的DIV元素,并找到该DIV内部的第一个h1标签,然后用html()方法获取h1标签里的内容(带html标签),也就是我们要获取的头条信息,如果使用text()方法,则只获取头条的文本内容。当然要使用好phpQuery,关键是要找对文档中对应内容的节点。
采集文章列表
下面再来看一个例子,获取helloweba.com网站的blog列表,请看代码:
// Load the phpQuery core, read the blog index, then print the <h2> markup of every .blog_li entry.
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('http://www.helloweba.com/blog.html');
foreach (pq(".blog_li") as $entry) {
    $title = pq($entry)->find('h2')->html();
    echo $title."";
}
通过循环列表中的DIV,找出文章标题并输出,就是这么简单。
解析XML文档
假设现在有一个这样的test.xml文档:
<?xml version="1.0" encoding="utf-8"?>
<root>
<contact>
<name>张三</name>
<age>22</age>
</contact>
<contact>
<name>王五</name>
<age>18</age>
</contact>
</root>
现在我要获取名字为张三的联系人的年龄,代码如下:
// Load the phpQuery core, read test.xml, then print the first <age> that is a direct child of <contact>.
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('test.xml');
$firstAge = pq('contact > age:eq(0)');
echo $firstAge;
结果输出:22
像jQuery一样,精准查找文档节点,输出节点下的内容,解析一个XML文档就是这么简单。现在你不必为采集网站内容而使用那些头疼的正则算法、内容替换等繁琐的代码了,有了phpQuery,一切就变得轻松多了。