• Perl 关键词搜索机器人


    这段代码是在网上找到的,觉得很不错,准备拿来分析一下。

    看别人的代码也是一种另类的学习方法。在学习的过程当中多看别人的代码能够提升自己的理解。

    特别是一些自己没有用过的模块,通过这些实例就能知道怎么去使用。

    当然,你也可以自己去研究官方那些文档。但是对于我来说,我觉得最快的方法就是看别人写的代码实例。

    或许每个人都有点不同吧。

    #!/usr/bin/perl
     # siteindexingbot.pl
     use warnings;
     use strict;
    
     use LWP::Simple;
     use LWP::RobotUA;
     use WWW::RobotRules;
     use HTML::Parse;
     use HTML::HeadParser;
     use URI::URL;
    
     # File-level crawl state shared with the subroutines below.
     my ($response, $tree, $link, %scanned);

     # the arrays and hashes used to store page data
     my (@pages, %titles, %keywords);

     my $url = $ARGV[0] or die "Usage: siteindexingbot [url]\n";

     # Root of the site being indexed; always ends in '/'.
     my $base_url = globalize_url('/', $url);

     # Resolve robots.txt against the start URL instead of appending to
     # $base_url: $base_url already ends in '/', so concatenation produced
     # "http://host//robots.txt".
     my $robots_txt = globalize_url('/robots.txt', $url);

     my $robot_rules = WWW::RobotRules->new(
        "indexifier/1.0 (libwww-perl-$LWP::VERSION)"
     );
    
     # look for and parse the robots.txt file
     # A single GET replaces the former HEAD-then-GET pair: LWP::Simple::get
     # returns undef on failure, so the rules parser is never handed undef.
     my $robots_content = get($robots_txt);
     if (defined $robots_content) {
        print "robots.txt file found OK.\n";
        $robot_rules->parse($robots_txt, $robots_content);
     } else {
        print "robots.txt file not found.\n";
     }
    
     # build the user agent
     # The (agent, from, rules) argument list is the LWP::RobotUA constructor
     # signature. Passing it to LWP::UserAgent->new made the list be read as
     # key/value options, silently dropping the agent name, the contact
     # address and the robots.txt rules.
     # NOTE: LWP::RobotUA throttles requests to the same host (default delay
     # is one minute); adjust with $ua->delay(...) if that is too slow.
     my $ua = LWP::RobotUA->new(
        "indexifier/1.0 (libwww-perl-$LWP::VERSION)",
        'me@here.com',
        $robot_rules
     );

     #$ua->proxy('http' => 'http://proxy.mylan.com/' );
     $ua->timeout(30);             # seconds before a request is abandoned
     $ua->max_size(1024 * 100);    # fetch at most 100 KB of any document
     $ua->parse_head(1);           # expose <head> data as response headers
    
     # Crawl the whole site starting from its root.
     scan($base_url);

     # Write one tab-separated record (URL, title, keywords) per indexed page.
     # Three-argument open with a lexical handle avoids mode injection and a
     # global bareword handle; close is checked because buffered write errors
     # only surface there.
     open(my $index_fh, '>', 'indexed.txt') or die "Opening indexed.txt: $!";
     foreach my $page (@pages) {
        print {$index_fh} join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
     }
     close($index_fh) or die "Closing indexed.txt: $!";

     exit;
    
     # Recursively crawl the page at $url.
     #
     # Records the page's title/keywords via get_info(), marks it as scanned,
     # then follows every link on it that is permitted by robots.txt, lies
     # under $base_url, answers a HEAD request without error and is served as
     # text/html. Progress and skipped links are reported on STDOUT.
     #
     # Parameters: $url - absolute URL of the page to scan.
     # Returns:    nothing.
     sub scan {
        my $url = shift;

        # Already visited: stay silent instead of announcing a scan that
        # never happens.
        return if $scanned{$url};

        print "Scanning '$url':\n";
        get_info($url);   # this is the extra subroutine
        $scanned{$url} = 'TRUE';

        foreach my $link (get_links($url)) {
           if (!$robot_rules->allowed($link)) {
              print "Access to $link is not allowed by robots.txt\n";
              next;
           }

           # \Q...\E: the base URL is literal text, not a pattern; without
           # quoting, '.' and '?' in it act as regex metacharacters.
           if ($link !~ /^\Q$base_url\E/i) {
              print "$link is not local to $base_url\n";
              next;
           }

           # HEAD is enough to learn whether the link is alive and what
           # type of document it serves.
           my $request = HTTP::Request->new('HEAD' => $link);
           my $response = $ua->request($request);
           if ($response->is_error) {
              print "Dead link to $link found on $url\n";
              next;
           }

           print "$url links to $link\n";

           # content_type yields the bare lower-cased media type, so
           # "text/html; charset=utf-8" still matches and a missing header
           # gives '' rather than undef.
           if ($response->content_type eq 'text/html') {
              scan($link);
           } else {
              print "$link is not HTML\n";
           }
        }
        return;
     }
    
     # Turn a possibly relative link into an absolute URL without fragment.
     #
     # Parameters: $link          - link text as found in a document.
     #             $referring_url - URL of the document containing the link,
     #                              used as the base for resolution.
     # Returns:    the absolute URL as a string, with any "#fragment" removed.
     sub globalize_url {
        my ($link, $referring_url) = @_;
        my $absolute_url = URI::URL->new($link, $referring_url)->abs->as_string;

        # Drop everything from the first '#'. The previous pattern required
        # at least one character after '#', so "page#" survived and was
        # treated as a different page from "page".
        $absolute_url =~ s/#.*\z//s;
        return $absolute_url;
     }
    
     # Fetch a page and list the targets of its links.
     #
     # Parameters: $url - absolute URL of the page to fetch.
     # Returns:    sorted list of unique absolute URLs taken from the page's
     #             <a>, <frame> and <iframe> elements; an empty list when the
     #             page cannot be fetched.
     sub get_links {
        my $url = shift;
        my $request = HTTP::Request->new('GET' => $url);
        $request->header('Accept' => 'text/html');
        my $response = $ua->request($request);

        # Do not harvest links from an error page.
        return () unless $response->is_success;

        my $tree = HTML::Parse::parse_html($response->content);

        # extract_links returns a reference to an array of entries whose
        # first element is the link text.
        my %seen;
        my @links;
        foreach my $entry (@{ $tree->extract_links('a', 'frame', 'iframe') }) {
           my $absolute = globalize_url($entry->[0], $url);
           push(@links, $absolute) unless $seen{$absolute}++;
        }

        # Release the parse tree explicitly so it is not kept alive for the
        # whole crawl.
        $tree->delete;

        # Sort the URL strings themselves; sorting the raw entries only
        # ordered them by reference address.
        @links = sort @links;
        return @links;
     }
    
     # Fetch a page and record its title and keywords for the index.
     #
     # Appends $url to @pages and stores the page's title in %titles and its
     # keywords in %keywords, using 'Untitled Document' / 'none' when the page
     # supplies nothing.
     #
     # Parameters: $url - absolute URL of the page to index.
     # Returns:    nothing.
     sub get_info {
        my $url = shift;
        my $request = HTTP::Request->new('GET' => $url);
        $request->header('Accept' => 'text/html');
        my $response = $ua->request($request);

        my $parser = HTML::HeadParser->new;
        $parser->parse($response->content);
        my $title = $parser->header('title') || 'Untitled Document';

        # Prefer the keywords meta tag, which is what %keywords is meant to
        # hold; fall back to the description for pages that have none.
        my $keywords = $response->header('X-Meta-Keywords')
           || $response->header('X-Meta-description')
           || 'none';

        # The index file is tab-separated with one record per line, so fold
        # embedded tabs/newlines into single spaces and trim the ends.
        for my $field ($title, $keywords) {
           $field =~ s/\s+/ /g;
           $field =~ s/^ //;
           $field =~ s/ $//;
        }

        push(@pages, $url);
        $titles{$url} = $title;
        $keywords{$url} = $keywords;
        return;
     }
    
  • 相关阅读:
    操作系统
    OSI协议
    5、hadoop常见端口汇总
    20、Linux 常用命令ll失效
    java 图形化工具Swing 监听键盘输入字符触发动作getInputMap();getActionMap();
    java 图形化工具Swing 基本使用
    java 图形化小工具Abstract Window Toolit ImageIO缩放图片,添加水印
    java 图形化小工具Abstract Window Toolit 画笔 处理位图
    java 图形化小工具Abstract Window Toolit :画笔Graphics,画布Canvas(),弹球小游戏
    java 图形化小工具Abstract Window Toolit 菜单项
  • 原文地址:https://www.cnblogs.com/xiaoCon/p/2934512.html
Copyright © 2020-2023  润新知