• 实验楼的 PHP 比赛题,网页数据提取。


    实验楼的 PHP 比赛题,网页数据提取。

    题目的地址:https://www.shiyanlou.com/contests/lou5/challenges

    以下代码是题目的答案

    <?php
    // Announce the response as UTF-8 HTML before any output is echoed, so the
    // Chinese progress messages printed by the crawler are decoded correctly.
    header('Content-Type:text/html;charset=utf-8');
    /**
     * Extracts course data from a saved HTML page and stores it in MySQL.
     *
     * Typical use: loadFile() reads the page, then parseContent() pulls the
     * course names, descriptions and types out of the <div class="course-*">
     * elements, flags names longer than 16 characters, and finally calls
     * saveData() to insert one row per course into the `course_data` table.
     */
    class Crawler{
        /** @var string HTML being parsed; narrowed to the <body> contents by parseCourseBody(). */
        private $content = '';
        /** @var array Parsed columns, keyed by 'cnames', 'cdescs', 'ctypes' and 'nlongs'. */
        private $data = array();
        /** @var mysqli|null Connection held open while saveData() is running. */
        static private $mysql;

        public function __construct(){
            echo "开始爬取内容....";
        }

        /**
         * Reads the HTML document to be parsed.
         *
         * @param string $file_path Path of the saved HTML page.
         * @throws RuntimeException When the file cannot be read.
         */
        public function loadFile($file_path){
            echo "正在加载文件";
            $html = is_readable($file_path) ? file_get_contents($file_path) : false;
            if($html === false){
                throw new RuntimeException("Unable to read file: " . $file_path);
            }
            $this->content = $html;
        }

        /**
         * Narrows the loaded document to the contents of its <body> element.
         *
         * The content is kept as a string (the first capture group) so that the
         * later div extraction can run on it. A document without a <body>
         * element is left untouched.
         */
        public function parseCourseBody(){
            // '#' is used as the delimiter so the '/' of the closing tag needs no escaping.
            if(preg_match('#<body[^>]*>(.*?)</body>#is', $this->content, $matches)){
                $this->content = $matches[1];
            }
        }

        /**
         * Runs the whole pipeline: parse every column, then persist the rows.
         *
         * @throws RuntimeException When saving to the database fails.
         */
        public function parseContent(){
            echo "开始解析内容...<br/>";
            $this->parseCourseBody();
            $this->parseTitle();
            $this->parseDesc();
            $this->parseType();
            $this->titleIsLong();
            $this->saveData();
            echo "解析内容结束!<br/>";
        }

        /**
         * Inserts one `course_data` row per parsed course name.
         *
         * Values are bound through a prepared statement: the text comes from a
         * scraped page and may contain quotes or hostile SQL.
         *
         * @throws RuntimeException When connecting, preparing or inserting fails.
         */
        public function saveData(){
            echo "存入数据库...<br/>";
            $cnames = isset($this->data['cnames']) ? $this->data['cnames'] : array();
            $cdescs = isset($this->data['cdescs']) ? $this->data['cdescs'] : array();
            $ctypes = isset($this->data['ctypes']) ? $this->data['ctypes'] : array();
            $nlongs = isset($this->data['nlongs']) ? $this->data['nlongs'] : array();
            if(count($cnames) === 0){
                // Nothing was parsed, so there is nothing to insert.
                return;
            }

            $db = new mysqli("localhost", "root", "root", "databases");
            if($db->connect_errno){
                throw new RuntimeException("Database connection failed: " . $db->connect_error);
            }
            self::$mysql = $db;

            try{
                if(!$db->set_charset("utf8")){
                    throw new RuntimeException("Unable to select utf8 charset: " . $db->error);
                }
                $stmt = $db->prepare("insert into `course_data`(`cname`,`cdesc`,`ctype`,`nlong`) values(?,?,?,?)");
                if($stmt === false){
                    throw new RuntimeException("Unable to prepare insert: " . $db->error);
                }

                // bind_param() binds by reference: bind once, reassign per row.
                $cname = $cdesc = $ctype = $nlong = '';
                $stmt->bind_param('ssss', $cname, $cdesc, $ctype, $nlong);
                foreach($cnames as $key => $name){
                    $cname = $name;
                    // The columns are parsed independently, so a page with a
                    // missing div may yield fewer descriptions/types than names.
                    $cdesc = isset($cdescs[$key]) ? $cdescs[$key] : '';
                    $ctype = isset($ctypes[$key]) ? $ctypes[$key] : '';
                    $nlong = isset($nlongs[$key]) ? $nlongs[$key] : 'false';
                    if(!$stmt->execute()){
                        throw new RuntimeException("Insert failed: " . $stmt->error);
                    }
                }
                $stmt->close();
            }catch(Exception $e){
                // Release the connection before propagating the failure.
                $db->close();
                self::$mysql = null;
                throw $e;
            }

            $db->close();
            self::$mysql = null;
        }

        /** Collects the course names from the <div class="course-name"> elements. */
        public function parseTitle(){
            echo "解析课程标题...<br/>";
            $this->data['cnames'] = $this->extractDivContents('course-name');
        }

        /** Collects the course descriptions from the <div class="course-desc"> elements. */
        public function parseDesc(){
            echo "解析课程简介...<br/>";
            $this->data['cdescs'] = $this->extractDivContents('course-desc');
        }

        /**
         * Derives each course type from its <div class="course-footer"> element.
         *
         * The type is the concatenation of the Chinese characters found in the
         * footer; a footer without any is treated as a free course ("免费").
         */
        public function parseType(){
            echo "解析课程类型...<br/>";
            $ctypes = array();
            foreach($this->extractDivContents('course-footer') as $footer){
                if(preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $footer, $match)){
                    $ctypes[] = join("", $match[0]);
                }else{
                    $ctypes[] = "免费";
                }
            }
            $this->data['ctypes'] = $ctypes;
        }

        /**
         * Flags, as the strings "true"/"false", each course name longer than
         * 16 characters. Requires parseTitle() to have run first.
         */
        public function titleIsLong(){
            echo "判断课程名是否超长...<br/>";
            $cnames = isset($this->data['cnames']) ? $this->data['cnames'] : array();
            $nlongs = array();
            foreach($cnames as $value){
                // Count characters, not bytes: the names are UTF-8 text.
                $nlongs[] = mb_strlen($value, 'UTF-8') > 16 ? "true" : "false";
            }
            $this->data['nlongs'] = $nlongs;
        }

        /**
         * Returns the trimmed inner HTML of every <div> whose class attribute
         * starts the tag and equals $class_name.
         *
         * The match is non-greedy and stops at the first closing </div>, so a
         * div containing nested divs is cut at the first inner one.
         *
         * @param string $class_name Value of the class attribute to look for.
         * @return array List of inner HTML strings, empty when nothing matches.
         */
        private function extractDivContents($class_name){
            if(!is_string($this->content) || $this->content === ''){
                return array();
            }
            $regex = '#<div class="' . preg_quote($class_name, '#') . '"[^>]*>(.*?)</div>#is';
            if(!preg_match_all($regex, $this->content, $matches)){
                return array();
            }
            return array_map('trim', $matches[1]);
        }
    }
    // Entry point: load the saved course page, then parse it and store the rows.
    $crawler = new Crawler();
    $crawler->loadFile("test.html");
    $crawler->parseContent();
    
    /**
     Table structure
    cname (varchar): the full course name
    cdesc (varchar): the course description
    ctype (varchar): the course type; one of 免费 (free), 会员 (member), 训练营 (bootcamp)
    nlong (enum('true','false')): whether the course name is too long; 'true' when the name exceeds 16 characters, otherwise 'false'
    
    create table `course_data`(
    	`id` int(11) not null auto_increment,
    	`cname` varchar(255) default null,
    	`cdesc` varchar(255) default null,
    	`ctype` varchar(255) default null,
    	`nlong` enum('true','false') default null,
    	primary key (`id`)
    )engine=InnoDB default charset=utf8;
    */
    

      

  • 相关阅读:
    《英语语法新思维初级教程》学习笔记(七)五种基本句型
    《英语语法新思维初级教程》学习笔记(六)实义动词与(情态)助动词
    《英语语法新思维初级教程》学习笔记(五)形容词
    《英语语法新思维初级教程》学习笔记(四)数量限定词和个体限定词
    C# Redis存Session Hash存对象
    MVC中Spring.net 对基类控制器无效 过滤器控制器无效
    C# datagridview列绑定类中类的属性
    商品评分效果JavaScript
    C# SQL数据库学习时遇到到一些异常
    C语言用一维数组打印杨辉三角(原:无意中想到)
  • 原文地址:https://www.cnblogs.com/yxhblogs/p/6878366.html
Copyright © 2020-2023  润新知