Perl 关键词搜索机器人

这段代码是在网上找的，觉得很不错，准备拿来分析一下。

看别人的代码也是一种另类的学习方法。在学习的过程当中多看别人的代码能够提升自己的理解。

特别是一些自己没有用过的模块，通过这些实例就能知道怎么去使用。

当然，你也可以自己去研究官方那些文档。但是对于我来说，我觉得最快的方法就是看别人写的代码实例。

或许每个人都有点不同吧。

#!/usr/bin/perl
 # siteindexingbot.pl
 use warnings;
 use strict;

 use LWP::Simple;
 use LWP::RobotUA;
 use WWW::RobotRules;
 use HTML::Parse;
 use HTML::HeadParser;
 use URI::URL;

 # File-level state shared with the subroutines below.
 # $response and $tree are unused here; $link is kept declared because the
 # subroutines in this file may use it as a loop variable.
 my ($response, $tree, $link, %scanned);

 # the arrays and hashes used to store page data
 my (@pages, %titles, %keywords);

 my $url = $ARGV[0] or die "Usage: siteindexingbot [url]\n";

 # Resolve against the start URL so both values are absolute.  $base_url
 # ends in '/', so robots.txt is resolved separately instead of being
 # appended (which would produce "http://host//robots.txt").
 my $base_url   = globalize_url('/', $url);
 my $robots_txt = globalize_url('/robots.txt', $url);

 # One agent string, shared by the robots.txt rule matcher and the user agent.
 my $agent_name = "indexifier/1.0 (libwww-perl-$LWP::VERSION)";

 my $robot_rules = WWW::RobotRules->new($agent_name);

 # look for and parse the robots.txt file
 # LWP::Simple::get returns undef on failure, so a single fetch tells us
 # both whether the file exists and what it contains.
 my $robots_content = get($robots_txt);
 if (defined $robots_content) {
    print "robots.txt file found OK.\n";
    $robot_rules->parse($robots_txt, $robots_content);
 } else {
    print "robots.txt file not found.\n";
 }

 # build the user agent
 # LWP::UserAgent->new takes key/value options; positional arguments are
 # not recognised and would leave the agent string and From address unset.
 # robots.txt rules are enforced explicitly in scan() via $robot_rules.
 my $ua = LWP::UserAgent->new(
    agent => $agent_name,
    from  => 'me@here.com',
 );

 #$ua->proxy('http' => 'http://proxy.mylan.com/' );
 $ua->timeout(30);
 $ua->max_size(1024 * 100);
 $ua->parse_head('TRUE');

 scan($base_url);

 # Write one tab-separated record per indexed page: URL, title, keywords.
 # Three-argument open with a lexical handle avoids mode injection and
 # a global bareword filehandle.
 open(my $index_fh, '>', 'indexed.txt') or die "Opening indexed.txt: $!";
 foreach my $page (@pages) {
    print {$index_fh} join("\t", $page, $titles{$page}, $keywords{$page}), "\n";
 }
 # Buffered write errors only surface at close, so check it.
 close($index_fh) or die "Closing indexed.txt: $!";

 exit;

 # scan($url)
 #
 # Records page info for $url, then follows each link found on it.  A link
 # is followed (recursively) only if robots.txt allows it, it lies under
 # $base_url, a HEAD request succeeds, and the response is text/html.
 # Progress and skipped links are reported on STDOUT.  URLs already present
 # in %scanned are not processed again.  Returns nothing.
 sub scan {
    my $url = shift;
    print "Scanning '$url':\n";
    return if $scanned{$url};

    get_info($url);   # this is the extra subroutine
    $scanned{$url} = 'TRUE';

    foreach my $link (get_links($url)) {
       if (!$robot_rules->allowed($link)) {
          print "Access to $link is not allowed by robots.txt\n";
          next;
       }

       # \Q...\E: the base URL contains regex metacharacters ('.', '?', ...)
       # that must be matched literally.
       if ($link !~ /^\Q$base_url\E/i) {
          print "$link is not local to $base_url\n";
          next;
       }

       my $request  = HTTP::Request->new('HEAD' => $link);
       my $response = $ua->request($request);
       if ($response->is_error) {
          print "Dead link to $link found on $url\n";
          next;
       }

       print "$url links to $link\n";

       # content_type() returns the lower-cased media type without
       # parameters (so "text/html; charset=utf-8" still matches) and an
       # empty string when the header is absent.
       if ($response->content_type eq 'text/html') {
          scan($link);
       } else {
          print "$link is not HTML\n";
       }
    }
    return;
 }

 # globalize_url($link, $referring_url)
 #
 # Resolves $link relative to $referring_url and returns the absolute URL
 # as a string with any fragment ("#...") removed, so that links to
 # different anchors of one page compare equal.
 sub globalize_url {
    my ($link, $referring_url) = @_;
    my $absolute = URI::URL->new($link, $referring_url)->abs;
    # fragment(undef) also drops an empty fragment (a trailing '#'),
    # which a pattern requiring text after the '#' would leave in place.
    $absolute->fragment(undef);
    return $absolute->as_string;
 }

 # get_links($url)
 #
 # Fetches $url and returns the absolute URLs of all links found in its
 # <a>, <frame> and <iframe> elements, in document order.  Returns an
 # empty list when the page cannot be fetched.
 sub get_links {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);

    # Do not harvest links from an error page.
    return () unless $response->is_success;

    my $tree = HTML::Parse::parse_html($response->content);
    my @links;
    # extract_links returns a reference to an array of [$link, $element, ...]
    # entries; only the link itself is needed here.
    foreach my $entry (@{ $tree->extract_links('a', 'frame', 'iframe') }) {
       push @links, globalize_url($entry->[0], $url);
    }
    # Release the parse tree explicitly once the links have been copied out.
    $tree->delete;
    return @links;
 }

 # get_info($url)
 #
 # Fetches $url and records it in the page index: appends the URL to
 # @pages and stores its title in %titles and its keywords in %keywords.
 # The title falls back to 'Untitled Document'; the keywords fall back to
 # the meta description and then to 'none'.  Returns nothing.
 sub get_info {
    my $url = shift;
    my $request = HTTP::Request->new('GET' => $url);
    $request->header('Accept' => 'text/html');
    my $response = $ua->request($request);

    my $parser = HTML::HeadParser->new;
    $parser->parse($response->content);

    my $title = $parser->header('title') || 'Untitled Document';

    # The X-Meta-* headers are added to the response from the document's
    # <meta> tags when the user agent has parse_head enabled.
    my $keywords = $response->header('X-Meta-Keywords')
       || $response->header('X-Meta-description')
       || 'none';

    # The index file is tab-separated with one record per line, so collapse
    # any tabs or newlines inside the values to single spaces.
    for ($title, $keywords) {
       s/\s+/ /g;
    }

    push(@pages, $url);
    $titles{$url}   = $title;
    $keywords{$url} = $keywords;
    return;
 }

相关阅读:
操作系统
 OSI协议
 5、hadoop常见端口汇总
 20、Linux 常用命令ll失效
 java 图形化工具Swing 监听键盘输入字符触发动作getInputMap()；getActionMap()；
java 图形化工具Swing 基本使用
 java 图形化小工具Abstract Window Toolit ImageIO缩放图片，添加水印
 java 图形化小工具Abstract Window Toolit 画笔处理位图
 java 图形化小工具Abstract Window Toolit ：画笔Graphics，画布Canvas()，弹球小游戏
 java 图形化小工具Abstract Window Toolit 菜单项
原文地址：https://www.cnblogs.com/xiaoCon/p/2934512.html