• A simple web crawler in Erlang


    %% spider.erl -- fetch a page over raw TCP, extract <a href ...> links,
    %% crawl each discovered host one level deeper, and dump the links to HTML.
    -module(spider).
    -compile(export_all).
    -import(lists, [reverse/1, reverse/2, map/2]).
    
    %% Fetch "/" from Host on port 80 and return the whole response as a binary.
    nano_get_url(Host) ->
        {ok, Socket} = gen_tcp:connect(Host, 80, [binary, {packet, 0}]),
        %% send a Host header too; many virtual-hosted servers require it
        ok = gen_tcp:send(Socket, "GET / HTTP/1.0\r\nHost: " ++ Host ++ "\r\n\r\n"),
        receive_data(Socket, []).

    %% Accumulate packets until the server closes the connection.
    receive_data(Socket, SoFar) ->
        receive
            {tcp, Socket, Bin} ->
                receive_data(Socket, [Bin|SoFar]);
            {tcp_closed, Socket} ->
                list_to_binary(reverse(SoFar))
        end.
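    %% The socket uses gen_tcp's default active mode, so incoming data is
    %% delivered as {tcp,Socket,Bin} messages and {tcp_closed,Socket} marks
    %% the end of the HTTP/1.0 response. A hypothetical call (any reachable
    %% host works):  B = spider:nano_get_url("www.example.com").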
    
    %% Write the collected URLs to File as a small HTML page.
    urls2htmlFile(Urls, File) ->
        file:write_file(File, urls2html(Urls)).

    bin2urls(Bin)  -> gather_urls(binary_to_list(Bin), []).

    bin2urls2(Bin) -> gather_urls2(binary_to_list(Bin), []).

    urls2html(Urls) -> [h1("Urls"), make_list(Urls)].

    h1(Title) -> ["<h1>", Title, "</h1>\n"].

    make_list(L) ->
        ["<ul>\n",
         map(fun(I) -> ["<li>", I, "</li>\n"] end, L),
         "</ul>\n"].
     
    
     gather_urls("<a href" ++ T,L) ->
        {Url,T1}=collect_url_body(T,reverse("<a href")),
         
    %%      case spider1:gather_urls(Url, []) of
    %%                              []->
    %%                                  nothing;
    %%                              [SubUrl]->
    %%                                  io:format("SubUrl:~p~n",[SubUrl]),
    %%                                  make2(SubUrl)
    %%                          end,
      gather_urls(T1,[Url|L]);
        
     gather_urls([_|T],L) -> 
        gather_urls(T,L); 
    
     gather_urls([],L) -> 
        L.
     
     gather_urls2("<a href" ++ T,L) ->
        {Url,T1}=collect_url_body(T,reverse("<a href")),
        gather_urls2(T1,[Url|L]);
    
     gather_urls2("<link href" ++ T,L) ->
        {Url,T1}=collect_url_body(T,reverse("<link href")),
        gather_urls2(T1,[Url|L]);
    
     gather_urls2([_|T],L) -> 
        gather_urls2(T,L);
     gather_urls2([],L) -> 
        L.
    
    collect_url_body("</a>" ++ T,L) -> {reverse(L,"</a>"),T};
    collect_url_body(">" ++ T,L) -> {reverse(L,">"),T};
    collect_url_body([H|T],L)       -> collect_url_body(T,[H|L]);
    collect_url_body([],_)          -> {[],[]}.
    
    %% Crawl the start page, then fetch each discovered host one level deeper.
    make() ->
        B = nano_get_url("www.baidu.com"),
        L = bin2urls(B),
        MakeSubFun = fun(Url) ->
                             io:format("Url1:~p~n", [Url]),
                             case spider1:gather_urls(Url, []) of
                                 [] ->
                                     nothing;
                                 [SubUrl|_] ->   %% use the first host if several are found
                                     io:format("SubUrl:~p~n", [SubUrl]),
                                     make2(SubUrl)
                             end
                     end,
        lists:foreach(MakeSubFun, L),
        %% the "http" directory must already exist; the code does not create it
        urls2htmlFile(L, "http\\1.html").

    make2(SubUrl) ->
        B = nano_get_url(SubUrl),
        L = bin2urls2(B),
        urls2htmlFile(L, "http\\" ++ SubUrl ++ ".html").
    %% spider1.erl -- belongs in its own file; extracts bare host names
    %% ("http://<host>/...") from the link fragments collected above.
    -module(spider1).
    -compile(export_all).
    -import(lists, [reverse/1]).

    gather_urls("http://" ++ T, L) ->
        {Url, T1} = collect_url_body(T, []),
        gather_urls(T1, [Url|L]);
    gather_urls([_|T], L) ->
        gather_urls(T, L);
    gather_urls([], L) ->
        L.

    %% Collect host characters up to the next "/" or closing quote.
    collect_url_body("/" ++ T, W)  -> {reverse(W), T};
    collect_url_body("\"" ++ T, W) -> {reverse(W), T};
    collect_url_body([Q|T1], W)    -> collect_url_body(T1, [Q|W]);
    collect_url_body([], _)        -> {[], []}.
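As a quick sanity check of the extraction logic, here is a hypothetical Erlang shell session showing the two passes on a hand-written HTML fragment (the markup and www.example.com are made-up examples, not output from a real crawl):

    1> B = <<"<p><a href=\"http://www.example.com/\">Example</a></p>">>.
    <<"<p><a href=\"http://www.example.com/\">Example</a></p>">>
    2> spider:bin2urls(B).
    ["<a href=\"http://www.example.com/\">"]
    3> spider1:gather_urls("<a href=\"http://www.example.com/\">", []).
    ["www.example.com"]

Note that bin2urls/1 keeps the whole <a href ...> fragment, quotes included; spider1:gather_urls/2 then strips it down to the bare host name.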


    The implementation above crawls the links on a page and then their second-level links; it only extracts the URLs that follow <a href ...> tags, so it can crawl nothing but links. There are now quite a few articles on Erlang crawlers that can also fetch images and other resources. The code here is somewhat messy and unoptimized, and is provided for reference only.
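    To try it out, a session along these lines should work, assuming the two modules are saved as separate files (spider.erl and spider1.erl), an "http" subdirectory already exists for the output files, and the machine has network access:

    1> c(spider).
    {ok,spider}
    2> c(spider1).
    {ok,spider1}
    3> spider:make().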

  • Original article: https://www.cnblogs.com/unqiang/p/2748402.html