• 对网站的代码采集实例


    1.采集的网站:http://www.abnova.com/support/publication.asp

    2. 相关的代码:列表(规则采集)页面使用:phpQuery.php,可以参考:PHP curl_setopt函数用法介绍中篇

    3.产品详情页面:信息(不规则采集),参考代码如下:

    <?php
    // Scrapes ONE product detail page per request.
    // The page to scrape is chosen by the `id` query parameter, which is used
    // as a zero-based line index into list.txt (one URL per line).
    header('Content-Type:text/html;charset=UTF-8');
    include 'phpQuery/phpQuery.php';
    set_time_limit(0);

    // NOTE(review): the default index is 1, so line 0 of list.txt is only
    // processed when ?id=0 is requested explicitly - confirm this is intended.
    $id = isset($_GET['id']) ? intval($_GET['id']) : 1;

    // file() returns false when the list cannot be read; stop with a clear
    // message instead of passing false on to array_key_exists().
    $listArr = file('list.txt');
    if ($listArr === false) {
        echo 'cannot read list.txt';
        exit;
    }

    // Running past the last line of the list means every URL has been handled.
    if (!array_key_exists($id, $listArr)) {
        echo 'finished';
        exit;
    }

    // Each element returned by file() still carries its line ending; trim()
    // removes "\n" as well as "\r\n" and any stray surrounding whitespace.
    $url = trim($listArr[$id]);
    phpQuery::newDocumentFile($url);

    // Element(s) with id "sub_product_info" in the fetched document.
    $artList = pq("#sub_product_info");
    echo '<pre>';
    
    // ---------------------------------------------------------------------
    // Small extraction helpers. They are closures so that this part of the
    // script does not declare any global function names.
    // ---------------------------------------------------------------------

    // Removes markup from an HTML fragment and trims the result.
    // $allowedTags is forwarded to strip_tags(); a null fragment becomes ''.
    $plainText = function ($html, $allowedTags = '') {
        return trim(strip_tags((string) $html, $allowedTags));
    };

    // Returns the raw HTML of the second <li> inside the element with id
    // $rowId, but only when the <b> caption inside that element equals
    // $label. Returns '' when the caption is missing or different.
    $labelledRowHtml = function ($node, $rowId, $label) use ($plainText) {
        $caption = $plainText(pq($node)->find('#' . $rowId . ' b')->html());
        if ($caption !== $label) {
            return '';
        }
        return (string) pq($node)->find('#' . $rowId . ' li')->eq(1)->html();
    };

    // Returns the text of the first <li> of the $ulIndex-th <ul> inside the
    // third ".part" element (index 2).
    $applicationNote = function ($node, $ulIndex) use ($plainText) {
        return $plainText(
            pq($node)->find('.part')->eq(2)->find('ul')->eq($ulIndex)->find('li')->eq(0)->html()
        );
    };

    // Reads a captioned value from the $ulIndex-th <ul> of the fourth ".part"
    // element (index 3): the first <li> holds the <b> caption, the second <li>
    // holds the value (inside an <a> when $inAnchor is true). Returns '' when
    // the caption does not equal $label.
    $geneField = function ($node, $ulIndex, $label, $inAnchor) use ($plainText) {
        $caption = $plainText(
            pq($node)->find('.part')->eq(3)->find('ul')->eq($ulIndex)->find('li')->eq(0)->find('b')->html()
        );
        if ($caption !== $label) {
            return '';
        }
        $valueCell = pq($node)->find('.part')->eq(3)->find('ul')->eq($ulIndex)->find('li')->eq(1);
        if ($inAnchor) {
            $valueCell = $valueCell->find('a');
        }
        return $plainText($valueCell->html());
    };

    // ---------------------------------------------------------------------
    // Lookup tables describing what to extract. The order of the entries is
    // the order of the columns written to the CSV file.
    // ---------------------------------------------------------------------

    // array(element id, expected caption, key in $data)
    $simpleFields = array(
        array('10000',  'Product Description:', 'Description'),
        array('90000',  'Immunogen:',           'Immunogen'),
        array('130000', 'Reactivity:',          'Reactivity'),
    );
    // Host sits between Immunogen and Reactivity in the CSV, so rebuild the
    // table in the exact column order instead of relying on the list above.
    $simpleFields = array(
        $simpleFields[0],
        $simpleFields[1],
        array('110000', 'Host:',    'Host'),
        $simpleFields[2],
        array('240000', 'Isotype:', 'Isotype'),
    );

    // array(<ul> index, text echoed in front of the value, key in $app)
    $applications = array(
        array(1, 'Western:   ',               'w1'),
        array(2, 'Western Blot:   ',          'w2'),
        array(3, 'Immunohistochemistry:    ', 'w3'),
        array(4, 'Immunofluorescence:    ',   'w4'),
        array(5, 'Sandwich ELISA:   ',        'w5'),
        array(6, 'ELISA:   ',                 'w6'),
    );

    // array(<ul> index, expected caption, key in $data, value is inside <a>?)
    // Index 6 is intentionally absent: the original script never read it.
    $geneFields = array(
        array(1, 'Entrez GeneID:',       'GeneID',             true),
        array(2, 'GeneBank Accession#:', 'GeneBank Accession', true),
        array(3, 'Protein Accession#:',  'Protein Accession',  true),
        array(4, 'Gene Name:',           'Gene Name',          false),
        array(5, 'Gene Alias:',          'Gene Alias',         false),
        array(7, 'Omim ID:',             'Omim ID',            false),
        array(8, 'Gene Ontology:',       'Gene Ontology',      true),
    );

    foreach ($artList as $li) {
        $data = array();

        // Product code: the part of the URL after the first '='.
        // Falls back to '' when the URL carries no query value.
        $urlParts = explode('=', $url);
        $data['code'] = isset($urlParts[1]) ? $urlParts[1] : '';

        // --- Captioned rows addressed by element id -----------------------
        foreach ($simpleFields as $field) {
            list($rowId, $label, $key) = $field;
            $value = $plainText($labelledRowHtml($li, $rowId, $label));
            echo $label . '   ' . $value;
            echo '<br/>';
            $data[$key] = $value;
        }

        // Quality control: <br> tags are kept while stripping so that the
        // text before the first run of three consecutive <br> can be isolated.
        $qcHtml = $plainText(
            $labelledRowHtml($li, '290000', 'Quality Control Testing:'),
            '<br>'
        );
        $qcParts = explode('<br><br><br>', $qcHtml);
        $qc = trim($qcParts[0]);
        echo 'Quality Control Testing:    ' . $qc;
        $data['Testing'] = $qc;
        echo '<br/>';
        echo '<hr/>';

        // --- Applications -------------------------------------------------
        $appTitle = $plainText(
            pq($li)->find('.part')->eq(2)->find('.first_title b')->html()
        );
        echo 'APP:   ' . $appTitle;
        echo '<br/>';

        $app = array();
        foreach ($applications as $entry) {
            list($ulIndex, $echoLabel, $appKey) = $entry;
            $note = $applicationNote($li, $ulIndex);
            echo $echoLabel . $note;
            echo '<br/>';
            // Only non-empty notes end up in the comma-separated summary.
            if ($note !== '') {
                $app[$appKey] = $note;
            }
        }
        $appstr = implode(',', $app);
        echo $appstr;
        $data['app'] = $appstr;
        echo '<hr/>';

        echo '<hr/>';
        echo '<br/>';

        // --- Gene information --------------------------------------------
        foreach ($geneFields as $field) {
            list($ulIndex, $label, $key, $inAnchor) = $field;
            $value = $geneField($li, $ulIndex, $label, $inAnchor);
            echo $label . '   ' . $value;
            echo '<br/>';
            $data[$key] = $value;
        }

        // --- Publication reference ---------------------------------------
        // Line breaks are removed first so that the non-greedy `.` can span
        // text that was spread over several lines. The reference is whatever
        // sits between "Publication Reference" and the next "Applications";
        // it stays '' when that pair is not present.
        $flatText = preg_replace('/[\r\n]+/', '', $li->textContent);
        $reference = '';
        $refMatch = array();
        if (preg_match('/Publication Reference(.*?)Applications/', $flatText, $refMatch)) {
            $reference = $refMatch[1];
        }
        $data['Reference'] = $reference;

        print_r($data);

        // --- Append the record to the CSV file -----------------------------
        $handle = fopen('list-new.csv', 'a');
        if ($handle === false) {
            // Stop here so the failing id stays visible instead of being
            // skipped by the automatic redirect to the next id.
            echo 'cannot open list-new.csv for writing';
            exit;
        }
        fputcsv($handle, $data);
        fclose($handle);
    }
    
    ?>
    <script>
    function JumpUrl(){
        location.href='?id=<?php echo ($id+1);?>';
    }
    setTimeout('JumpUrl()',0);
    </script>
  • 相关阅读:
    linux (debian) 配置静态ip
    使用yum高速部署Oracle安装环境(11g)
    shell 例程 —— 解决redis读取稳定性
    面向对象的设计模式(六),状态模式
    python调用shell命令的三种方法
    Codeforces Round #316 (Div. 2) C. Replacement(线段树)
    vim随想笔记(1)
    JavaSE入门学习12: Java面向对象之static使用方法
    软件架构————架构核对表
    模板管理类
  • 原文地址:https://www.cnblogs.com/wuheng1991/p/5213089.html
Copyright © 2020-2023  润新知