原文链接:http://blog.csdn.net/xyzhaopeng/article/details/6626340
从一个HTML页面的一个表格中提取数据并且将这个数据整理出来加入到MySQL数据库中。
假设目标HTML中我感兴趣的Table有3列,分别是ID,Name,内容。
index.php
<pre class="php" name="code"><?php
/*
 * index.php
 *
 * Loads a remote HTML page, finds every <table> whose class attribute is
 * exactly 'target_table_class', parses its rows through ContentManager and
 * asks the manager to persist the result.
 */

$urlTarget = "http://www.xxxx.com/targethtmlpage.html";

require_once('ContentManager.php');

// Real-world HTML is rarely well-formed. Collect libxml parse errors
// internally instead of emitting one PHP warning per markup problem, and
// restore the previous setting once loading is done.
$previousLibxmlSetting = libxml_use_internal_errors(true);

// Build the DOM object and parse the HTML file.
$htmDoc = new DOMDocument;
$loaded = $htmDoc->loadHTMLFile($urlTarget);

libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Fail loudly when the page could not be fetched or parsed; continuing would
// just iterate over an empty document and silently do nothing.
if ($loaded === false) {
    throw new RuntimeException('Unable to load HTML from ' . $urlTarget);
}

$htmDoc->normalizeDocument();

// Walk every <table> element in the document.
foreach ($htmDoc->getElementsByTagName('table') as $table) {
    // getAttribute() returns '' when the attribute is absent, so a strict
    // string comparison is safe. Note this matches only tables whose class
    // attribute is exactly this single value.
    if ($table->getAttribute('class') === 'target_table_class') {
        $contentMgr = new ContentManager();
        $contentMgr->ParseFromDOMElement($table);

        // At this point $contentMgr holds the parsed rows; do whatever is
        // needed with them, for example write them to MySQL.
        $contentMgr->SerializeToDB();
    }
}
?> </pre><br>
ContentManager.php
<?php
/**
 * ContentManager
 *
 * Collects one ContentInfo record per <tr> found inside a table element and
 * exposes a hook for persisting the collected records.
 *
 * @author xxxxx
 */
require_once('ContentInfo.php');

class ContentManager
{
    /**
     * Parsed records, appended in document order.
     *
     * @var ArrayObject
     */
    public $ContentList;

    /**
     * Starts with an empty record list.
     */
    public function __construct()
    {
        $this->ContentList = new ArrayObject();
    }

    /**
     * Parses every <tr> descendant of the given table into a ContentInfo and
     * appends it to $ContentList, then echoes the total number of records
     * held so far.
     *
     * Rows without <td> cells (e.g. header rows built from <th>) still
     * produce a ContentInfo whose fields are left unset.
     *
     * @param DOMElement $table The <table> element to read rows from.
     */
    public function ParseFromDOMElement(DOMElement $table)
    {
        // Read rows from the $table parameter (the original referenced an
        // undefined variable here).
        foreach ($table->getElementsByTagName('tr') as $row) {
            $contentInfo = new ContentInfo();
            $contentInfo->ParseFromDOMElement($row);
            $this->ContentList->append($contentInfo);
        }

        // Debug output: how many records have been parsed. Counts the list
        // that was actually filled above.
        echo $this->ContentList->count();
    }

    /**
     * Writes the collected records to the database (implementation omitted).
     */
    public function SerializeToDB()
    {
        // Database write code omitted.
    }
}
?>
ContentInfo.php
<?php
/**
 * ContentInfo
 *
 * Holds the first three cell values of one table row: ID, Name and Content.
 *
 * @author xxxxx
 */
class ContentInfo
{
    /** @var string|null Text of the 1st <td> in the row; unset until parsed. */
    public $ID;

    /** @var string|null Text of the 2nd <td> in the row; unset until parsed. */
    public $Name;

    /** @var string|null Text of the 3rd <td> in the row; unset until parsed. */
    public $Content;

    /**
     * Fills ID, Name and Content from the first three <td> descendants of
     * the given row, in document order. Further cells are ignored; fields
     * whose cell is missing are left untouched.
     *
     * @param DOMElement $row The <tr> element to read cells from.
     */
    public function ParseFromDOMElement(DOMElement $row)
    {
        // The original also read $row->length here; DOMElement has no such
        // property (the length lives on the node list) and the value was
        // never used, so the statement is dropped.
        $curCellIdx = 0;
        foreach ($row->getElementsByTagName('td') as $cell) {
            switch ($curCellIdx++) {
                case 0:
                    $this->ID = $cell->nodeValue;
                    break;
                case 1:
                    $this->Name = $cell->nodeValue;
                    break;
                case 2:
                    $this->Content = $cell->nodeValue;
                    break;
            }
        }
    }
}
?>