• perl 爬虫两个技巧


    <pre name="code" class="cpp">jrhmpt01:/root/lwp# cat data.html 
         <div class="m-page J-ajax-page">
            <a class="changePage" page="1" href="javascript:void(0);">首页</a> <a class="changePage" page="11" href="javascript:void(0);">上一页</a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
         </div>
    
        <div class="m-page J-ajax-page">
            <a class="changePage" page="1" href="javascript:void(0);">首页</a> <a class="changePage" page="11" href="javascript:void(0);">上一页</a>  <a class="changePage" page="11" href="javascript:void(0);">11</a>  <a class="cur" href="javascript:void(0)">12</a> <span class="page_info">12/12</span>
         </div>
    	 
    	 
    
    jrhmpt01:/root/lwp# cat c1.pl 
    # c1.pl -- print the text of every <a class="changePage"> link in data.html.
    #
    # Parses the local file data.html with HTML::TreeBuilder::XPath and uses an
    # XPath query to collect the text content of the matching links.
    use strict;
    use warnings;

    use  LWP::UserAgent;
    use DBI;
    use POSIX;
    use Data::Dumper;
    use HTML::TreeBuilder;
    use HTML::TreeBuilder::XPath;

    # User agent set up for fetching pages; the local-file demo below does not
    # use it.
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;
    $ua->agent("Mozilla/8.0");

    my $tree = HTML::TreeBuilder::XPath->new;

    # parse_file returns undef when the file cannot be opened; fail loudly
    # instead of silently querying an empty tree.
    $tree->parse_file("data.html")
        or die "cannot parse data.html: $!";

    # findvalues returns one text value per matching node (findvalue would
    # return a single concatenated string instead).
    my @title = $tree->findvalues('/html/body//a[@class="changePage"]');

    # The first sigil is escaped so the label prints literally as "@title";
    # only the second occurrence is interpolated.
    print "\@title is @title\n";

    # Explicitly free the tree once we are done with it.
    $tree->delete;
    
    jrhmpt01:/root/lwp# perl c1.pl
    @title is 首页 上一页 11 首页 上一页 11
    
    my @title=  $tree->findvalues('/html/body//a[@class="changePage"]');
    表示:在 body 下查找所有 class 属性等于 "changePage" 的 a 标签,并以列表形式返回每个标签的文本内容(注意用的是 findvalues;findvalue 只会返回一个拼接后的字符串)。
    
    
    jrhmpt01:/root/lwp# cat c1.pl 
    # c1.pl -- print the "page" attribute of every <a> element in data.html.
    #
    # Parses the local file data.html, walks all <a> elements and prints the
    # value of their page attribute, skipping elements that do not have one.
    use strict;
    use warnings;

    use  LWP::UserAgent;
    use DBI;
    use POSIX;
    use Data::Dumper;
    use HTML::TreeBuilder;
    use HTML::TreeBuilder::XPath;

    # User agent set up for fetching pages; the local-file demo below does not
    # use it.
    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->env_proxy;
    $ua->agent("Mozilla/8.0");

    my $tree = HTML::TreeBuilder::XPath->new;

    # parse_file returns undef when the file cannot be opened; fail loudly
    # instead of silently iterating over an empty tree.
    $tree->parse_file("data.html")
        or die "cannot parse data.html: $!";

    # In list context find_by_tag_name returns every matching element.
    my @pages = $tree->find_by_tag_name('a');

    for my $link (@pages) {
        # attr() returns a single scalar, or undef when the attribute is absent
        # (e.g. the class="cur" link in data.html has no page attribute).
        my $page = $link->attr('page');

        # Test definedness/length rather than truthiness so page="0" is kept.
        next unless defined $page && length $page;

        # The sigil is escaped so the label prints literally as "$_".
        print "\$_ is $page\n";
    }

    # Explicitly free the tree once we are done with it.
    $tree->delete;
    jrhmpt01:/root/lwp# perl c1.pl 
    $_ is 1
    $_ is 11
    $_ is 11
    $_ is 1
    $_ is 11
    $_ is 11
    
    
    遍历所有 a 标签,逐个取出并打印其 page 属性的值(没有 page 属性的标签会被跳过)


    
                                        
    
  • 相关阅读:
    再谈spark部署搭建和企业级项目接轨的入门经验(博主推荐)
    CSS基础3——使用CSS格式化元素内容的字体
    利用MySQL 的GROUP_CONCAT函数实现聚合乘法
    POJ Octal Fractions(JAVA水过)
    组件接口(API)设计指南-文件夹
    Nginx 因 Selinux 服务导致无法远程訪问
    host字段变复杂了
    hdu 1251 统计难题 初识map
    “那个人样子好怪。”“我也看到了,他好像一条狗。”
    pomelo 协议
  • 原文地址:https://www.cnblogs.com/hzcya1995/p/13350886.html
Copyright © 2020-2023  润新知