#!/usr/bin/perl
# Scrape every post from the CSDN blog "zhaoyangjian724":
#   1. fetch the blog front page and extract each category's URL and name;
#   2. for each category, create a directory named after it, read the pager
#      to find how many listing pages it has;
#   3. for each listing page, collect post titles and post-detail URLs;
#   4. download each post and save it as "<title>.html" inside the
#      category directory.
# Titles/links are decoded from UTF-8 and re-encoded to GBK so they are
# usable as filenames on a Chinese-locale Windows box.
use strict;
use warnings;
use LWP::UserAgent;
use POSIX;
use HTML::TreeBuilder::XPath;
use Encode;
use HTML::TreeBuilder;

# Single quotes: the backslash must survive (double quotes would eat "\p").
my $base_dir = 'F:\pa';

my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->agent('Mozilla/8.0');

# Fetch the blog front page and cache it locally for XPath parsing.
my $response = $ua->get('http://blog.csdn.net/zhaoyangjian724');
open my $datafh, '>', 'csdn.html' or die "open csdn file failed: $!";
print {$datafh} $response->content if $response->is_success;
close $datafh;

my $tree = HTML::TreeBuilder::XPath->new;
$tree->parse_file('csdn.html');

# Category URLs: every <a href> containing "category", e.g.
# <a href="/zhaoyangjian724/article/category/1756569" ...>Oracle dump...
my @category_urls;
for my $anchor ($tree->find_by_tag_name('a')) {
    my $href = $anchor->attr('href');
    next unless defined $href && $href =~ /category/;
    print "href is $href\n";
    push @category_urls, $href;
}
print "category urls: @category_urls\n";

# Category display names come from the panel list; re-encode UTF-8 -> GBK
# because they become directory names.
my @category_names =
    map { encode('gbk', decode('utf8', $_)) }
    $tree->findvalues('/html/body//ul[@class="panel_body"]/li/a');
print "category names: @category_names\n";

# Diagnostic dump: URL <-> name pairing (assumed positional — TODO confirm
# the two XPath result sets really line up one-to-one).
for my $i (0 .. $#category_urls) {
    print "$category_urls[$i]===$category_names[$i]\n";
}

# ---- per-category loop -------------------------------------------------
for my $i (0 .. $#category_names) {
    mkdir $category_names[$i] unless -d $category_names[$i];
    chdir "$base_dir/$category_names[$i]"
        or die "chdir $base_dir/$category_names[$i] failed: $!";

    # First listing page of the category: used only to read the pager.
    my $cat_resp = $ua->get("http://blog.csdn.net$category_urls[$i]");
    print "category url: $category_urls[$i]\n";
    open my $fh1, '>', 'fh1.html' or die "open fh1 failed: $!";
    print {$fh1} $cat_resp->content;
    close $fh1;

    my $cat_tree = HTML::TreeBuilder::XPath->new;
    $cat_tree->parse_file('fh1.html');

    # Pager text looks like "150条数据 共8页"; grab the page count.
    # (Regex escapes restored: the original had s+/d+ with the backslashes
    # stripped, which matched literal letters.)
    my $page_count;
    my @pager = $cat_tree->findvalues('/html/body//div[@id="papelist"]/span');
    if (@pager && $pager[0] =~ /.*\s+.*?(\d+).*/) {
        $page_count = $1;
    }
    print "pager: @pager\n";
    sleep 5;
    $page_count ||= 1;    # single-page categories have no pager at all
    print "page count: $page_count\n";
    sleep 5;

    # ---- per-listing-page loop ------------------------------------------
    for my $page (1 .. $page_count) {
        my $url = "http://blog.csdn.net$category_urls[$i]/$page";
        print "url is $url\n";
        my $page_resp = $ua->get($url);
        open my $fh2, '>', 'fh2.html' or die "open fh2 failed: $!";
        print {$fh2} $page_resp->content;
        close $fh2;

        my $page_tree = HTML::TreeBuilder::XPath->new;
        $page_tree->parse_file('fh2.html');

        # Post titles on this page (become filenames, hence GBK).
        my @titles =
            map { encode('gbk', decode('utf8', $_)) }
            $page_tree->findvalues('/html/body//span[@class="link_title"]');
        print "titles: @titles\n";
        sleep 10;

        # Post-detail URLs. The sidebar ("reading ranking") links match the
        # same pattern, so de-duplicate and keep only as many links as there
        # are titles on this page.
        my (@post_urls, %seen);
        for my $anchor ($page_tree->find_by_tag_name('a')) {
            my $link = $anchor->attr('href');
            next unless defined $link
                && $link =~ m{zhaoyangjian724/article/details/(\d+)$};
            next if $seen{$link}++;    # replaces deprecated smartmatch ~~
            print "link: $link\n";
            push @post_urls, encode('gbk', decode('utf8', $link));
        }
        splice @post_urls, scalar @titles if @post_urls > @titles;
        print "post urls: @post_urls\n";
        sleep 10;

        # Download each post; strip whitespace from the title so it is a
        # safe filename.
        for my $m (0 .. $#post_urls) {
            $titles[$m] =~ s/\s+//g;
            print "===========================\n";
            print "$titles[$m]======$post_urls[$m]\n";
            print "===========================\n";
            my $post_resp = $ua->get("http://blog.csdn.net$post_urls[$m]");
            open my $fh3, '>', "$titles[$m].html"
                or die "open $titles[$m].html failed: $!";
            print {$fh3} $post_resp->content;
            close $fh3;
        }
    }
    # Back to the root so the next category's relative mkdir/chdir work.
    chdir $base_dir or die "chdir $base_dir failed: $!";
}