帮朋友爬取豆瓣电影介绍页里的内容,废话不多说,直接上代码。
简单的爬取分为两个文件:
fectch.php
<?php
// Entry script: looks a movie up through Douban's suggestion endpoint by exact
// title, downloads the matching detail page, extracts a set of fields with
// XPath and prints the result as a JSON document.
require "./getfunction.php";

// Title to search for; a suggestion is accepted only if its title is identical.
$name = "复仇者联盟3:无限战争";
// The title contains non-ASCII characters and punctuation, so it must be
// percent-encoded before it is placed in the query string.
$url = "https://movie.douban.com/j/subject_suggest?q=".urlencode($name);

$curl = curl_init(); // start a cURL session
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_HEADER, 0);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
// NOTE(review): peer-certificate and host-name verification are switched off,
// which leaves both requests open to man-in-the-middle tampering. Kept as in the
// original so hosts without a CA bundle keep working; enable both if possible.
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);

$tmpInfo = curl_exec($curl); // raw response body of the suggestion endpoint
if ($tmpInfo === false) {
    // Same error report the detail-page request uses further down.
    echo "<br />cURL error number:" .curl_errno($curl);
    echo "<br />cURL error:" . curl_error($curl);
    exit;
}
$tmpInfo = json_decode($tmpInfo);

// Keep only the suggestions whose title matches exactly and that carry a URL.
// json_decode() yields null on malformed input, hence the is_array() guard.
$arrat_res = [];
if (is_array($tmpInfo)) {
    foreach ($tmpInfo as $v) {
        if (isset($v->title, $v->url) && $name === $v->title) {
            $arrat_res[] = $v;
        }
    }
}
if (empty($arrat_res)) {
    $data = [
        "code"=>10001,
        "msg"=>"暂无片源信息"
    ];
    echo json_encode($data);
    exit;
}

// Fetch the detail page of the first match. The options set above stay active
// on the handle, so only the URL has to change.
$url2 = $arrat_res[0]->url;
curl_setopt($curl, CURLOPT_URL, $url2);
$tmpInfo2 = curl_exec($curl); // HTML of the detail page
if (!$tmpInfo2) {
    echo "<br />cURL error number:" .curl_errno($curl);
    echo "<br />cURL error:" . curl_error($curl);
    exit;
}

// Parse the HTML. Real-world markup triggers libxml warnings; collect them
// internally instead of silencing the call with "@".
$dom = new DOMDocument();
$previousLibxmlMode = libxml_use_internal_errors(true);
$dom->loadHTML($tmpInfo2);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlMode);
$dom->normalize();
$xpath = new DOMXPath($dom);

// Returns the nodeValue of the first node matching $expression, or "" when
// nothing matches, so callers always receive a string.
$firstValue = function ($expression) use ($xpath) {
    $nodes = $xpath->query($expression);
    if ($nodes === false || $nodes->length === 0) {
        return "";
    }
    return $nodes->item(0)->nodeValue;
};

// Returns the nodeValue of every node matching $expression, joined by commas.
$joinValues = function ($expression) use ($xpath) {
    $values = [];
    $nodes = $xpath->query($expression);
    if ($nodes !== false) {
        foreach ($nodes as $node) {
            $values[] = $node->nodeValue;
        }
    }
    return implode(",", $values);
};

// Directors
$directors_res = $joinValues("//*[@id='info']/span[1]/span[2]/a/text()");
// Title as shown on the page
$name = $firstValue("//*[@id='content']/h1/span[1]/text()");
// Year
$years = $firstValue("//*[@id='content']/h1/span[2]/text()");
// Poster image URL
$img = $firstValue("//*[@id='mainpic']/a/img/@src");

// Release status: 1 = not released yet, 2 = released. When the status node is
// missing the movie is treated as released, because the "not released" marker
// was not found.
$is_on = 2;
if (trim($firstValue("//*[@id='interest_sectl']/div/div[2]/div/div[2]")) === "尚未上映") {
    $is_on = 1;
}

// Screenwriters
$screenwriters_res = $joinValues("//*[@id='info']/span[2]/span[2]/a/text()");
// Actors (text of the whole cast container)
$actors_res = $joinValues("//*[@id='info']/span[3]/span[2]");

// Genres: collected from span 5 onwards until the country label (or the
// official-site label handled inside getRes) is reached.
$sear_res = getFunction::getRes(5,"制片国家/地区:",$xpath);
$types_res = $sear_res["res"];
// getRes reports "" when no stop label was found; normalise to an int so the
// offset arithmetic below cannot fail.
$num = (int) $sear_res["num"];

// Bare text nodes of #info that contain at least one CJK ideograph. The
// \x{...} escapes need their backslashes; without them the pattern does not
// compile and nothing is ever collected.
$attr = [];
$langs = $xpath->query("//*[@id='info']/text()");
if ($langs !== false) {
    foreach ($langs as $langNode) {
        $lang = $langNode->nodeValue;
        if (preg_match('/[\x{4e00}-\x{9fa5}]/u', $lang) === 1) {
            $attr[] = $lang;
        }
    }
}

// NOTE(review): the +4 / +1 offsets depend on the order of the spans inside
// #info on the detail page - verify against the current page layout.
if ($is_on == 1) {
    // Not released: there is no runtime, only the release dates.
    $show_res = "";
    $sear2_res = getFunction::getRes($num+4,"又名:",$xpath);
    $time_res = $sear2_res["res"];
    $num = (int) $sear2_res["num"];
}else{
    // Release dates
    $sear2_res = getFunction::getRes($num+4,"片长:",$xpath);
    $time_res = $sear2_res["res"];
    $num = (int) $sear2_res["num"];
    // Runtime
    $sear3_res = getFunction::getRes($num+1,"又名:",$xpath);
    $show_res = $sear3_res["res"];
    $num = (int) $sear3_res["num"];
}

// Map the collected text nodes onto fields. With four entries the third one is
// appended to the runtime; otherwise the third entry is taken as the alias.
// Missing entries become "" instead of raising undefined-offset errors.
$country = isset($attr[0]) ? $attr[0] : "";
$languages = isset($attr[1]) ? $attr[1] : "";
if (count($attr) == 4) {
    $show_res = $show_res.$attr[2];
    $byname = $attr[3];
}else{
    $byname = isset($attr[2]) ? $attr[2] : "";
}

// Links inside #info: when two links exist the first goes to "web_url" and the
// second to "imbd"; with a single link it goes to "imbd" alone.
$secondLinks = $xpath->query("//*[@id='info']/a[2]/@href");
if ($secondLinks !== false && $secondLinks->length > 0) {
    $urlim = $secondLinks->item(0)->nodeValue;
    $urls = $firstValue("//*[@id='info']/a[1]/@href");
}else{
    $urls = "";
    $urlim = $firstValue("//*[@id='info']/a[1]/@href");
}

$final_res = [
    "all_name" => $name.$years,
    "name" => $name,
    "year" => $years,
    "img" => $img,
    "directors" => $directors_res,
    "screenwriters" => $screenwriters_res,
    "actors" => $actors_res,
    "types" => $types_res,
    "web_url" => $urls,
    "country" => $country,
    "languages" => $languages,
    "ontime" => $time_res,
    "showtime" => $show_res,
    "byname" => $byname,
    "imbd" => $urlim
];
$return = ["code"=>0, "msg"=>"抓取成功", "data"=>$final_res ];
echo json_encode($return);
getfunction.php
<?php
/**
 * Helper for reading runs of `span` children of the element with id "info".
 */
class getFunction{
    /**
     * Collects the text of the `#info` child spans from position $start up to
     * a stop label.
     *
     * Spans are visited by 1-based XPath position from $start up to, but not
     * including, $limit. Each span's text is collected until a span equal to
     * $key or to "官方网站:" is seen; after that nothing more is collected, but
     * the scan continues and a later stop label overwrites the reported
     * position. Positions without a span are skipped and never produce a
     * leading or doubled comma.
     *
     * @param int    $start First span position to inspect.
     * @param string $key   Label text that ends the collection.
     * @param object $xpath DOMXPath bound to the document to search.
     * @param int    $limit Exclusive upper bound for the position (default 30).
     *
     * @return array "res" => collected texts joined by ",";
     *               "num" => position of the last stop label seen, or "" when
     *               no stop label was found.
     */
    public static function getRes($start,$key,$xpath,$limit = 30){
        $collected = [];
        $num = "";
        $stopSeen = false;
        $key = (string) $key;
        for ($i = $start; $i < $limit; $i++) {
            $types = $xpath->query("//*[@id='info']/span[".$i."]");
            if ($types === false || $types->length === 0) {
                continue;
            }
            $info_res = $types->item(0)->nodeValue;
            if ($info_res === $key || $info_res === "官方网站:") {
                // Remember the position of the most recent stop label.
                $num = $i;
                $stopSeen = true;
            } elseif (!$stopSeen) {
                $collected[] = $info_res;
            }
        }
        return ["res" => implode(",", $collected), "num" => $num];
    }
}