• colly爬虫 go


    package main
    
    import (
    	"bufio"
    	"fmt"
    	"github.com/antchfx/htmlquery"
    	"github.com/gocolly/colly"
    	"github.com/gocolly/colly/extensions"
    	"io/ioutil"
    	"log"
    	"net/http"
    	"os"
    	"regexp"
    	"strings"
    	"sync"
    	"time"
    )
    // wg counts pending save jobs: main calls wg.Add(1) for every page that
    // yields a .txt link and wg.Wait() before returning; saveTxt calls wg.Done().
    var wg sync.WaitGroup
    // ch is allocated in main with capacity 10. main sends one value per matching
    // page and saveTxt receives one value when it finishes — presumably a counting
    // semaphore meant to cap concurrent downloads at 10.
    var ch chan int
    
    // main crawls www.uidzhx.com starting from a fixed seed page and follows
    // every link it finds. For each page that has a title node at the expected
    // XPath and contains a .txt link on dzs.uidzhx.com, it starts a saveTxt
    // goroutine. It returns once the crawl is finished and all started saveTxt
    // goroutines have called wg.Done.
    func main() {
    	// ch is used as a counting semaphore: a slot is taken here before each
    	// saveTxt goroutine starts and given back by saveTxt when it is done.
    	ch = make(chan int, 10)

    	// Both patterns are compiled once up front instead of once per response.
    	// NOTE(review): the dots are unescaped and ".*" is greedy, so these match
    	// more loosely than a literal URL would — confirm against real pages.
    	reNotAllow := regexp.MustCompile(`http://www.uidzhx.com/du/.*.html`)
    	reTxt := regexp.MustCompile(`http://dzs.uidzhx.com.*.txt`)

    	c := colly.NewCollector(
    		colly.AllowedDomains("www.uidzhx.com"),
    		colly.IgnoreRobotsTxt(),
    		colly.DisallowedURLFilters(reNotAllow),
    	)
    	// Revisits stay disabled and requests stay synchronous, so c.Visit below
    	// only returns after the whole crawl has been processed.
    	c.AllowURLRevisit = false
    	c.Async = false
    	extensions.RandomUserAgent(c)
    	extensions.Referer(c)

    	c.OnRequest(func(r *colly.Request) {
    		fmt.Println("Visiting", r.URL.String())
    	})

    	if err := c.Limit(&colly.LimitRule{
    		DomainGlob:  "*",
    		RandomDelay: 1 * time.Second,
    	}); err != nil {
    		log.Fatalf("set limit rule: %v", err)
    	}

    	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
    		link := e.Attr("href")
    		// Visit reports already-visited, filtered and off-domain URLs as
    		// errors; those are expected while crawling, so the result is
    		// deliberately discarded.
    		_ = c.Visit(e.Request.AbsoluteURL(link))
    	})

    	// Runs for every response received.
    	c.OnResponse(func(r *colly.Response) {
    		body := string(r.Body)

    		doc, err := htmlquery.Parse(strings.NewReader(body))
    		if err != nil {
    			// One unparsable page must not abort the whole crawl.
    			log.Printf("parse %s: %v", r.Request.URL, err)
    			return
    		}

    		title := htmlquery.FindOne(doc, `/html/body/div[4]/div[2]/div[1]/div/div[2]/div/h1`)
    		if title == nil {
    			return
    		}

    		txtURL := reTxt.FindString(body)
    		if txtURL == "" {
    			return
    		}

    		txtTitle := strings.Replace(htmlquery.InnerText(title), " ", "", -1)
    		fmt.Println(txtTitle)

    		// Register the job and take a semaphore slot before starting the
    		// goroutine. The send blocks while 10 downloads are in flight, which
    		// throttles the crawl. saveTxt is responsible for wg.Done and for
    		// releasing the slot; without the goroutine wg.Wait would never return.
    		wg.Add(1)
    		ch <- 1
    		go saveTxt(txtTitle, txtURL)
    	})

    	if err := c.Visit("http://www.uidzhx.com/Shtml89401.html"); err != nil {
    		log.Printf("visit seed page: %v", err)
    	}

    	wg.Wait()
    }
    
    func saveTxt(title string,url string )  {
    	defer wg.Done()
    	str := download(url)
    	fmt.Println(str)
    	fmt.Printf("save txt %s - %s
    ",title,url)
    	filePath := "d:/crawl/"+title+".txt"
    	file, err := os.OpenFile(filePath, os.O_WRONLY | os.O_CREATE, 0666)
    	if err != nil {
    		fmt.Printf("open file err=%v
    ", err)
    		return
    	}
    	//及时关闭file句柄
    	defer file.Close()
    	//写入时,使用带缓存的 *Writer
    	writer := bufio.NewWriter(file)
    	for i := 0; i < 5; i++ {
    		writer.WriteString(str)
    	}
    	<- ch
    }
    
    // download performs an HTTP GET on url with a fixed User-Agent header and
    // returns the response body as a string.
    //
    // It panics with a string message if the request cannot be built, the
    // request fails, or the body cannot be read. The HTTP status code is not
    // inspected, so an error page's body is returned like any other.
    func download(url string) string {
    	// Bound the whole exchange so a stalled server cannot hang the caller
    	// indefinitely.
    	client := &http.Client{Timeout: 60 * time.Second}

    	req, err := http.NewRequest("GET", url, nil)
    	if err != nil {
    		// A malformed URL yields a nil request; fail explicitly instead of
    		// dereferencing it below.
    		fmt.Print("new request err", err)
    		panic("new request err")
    	}

    	req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)")
    	resp, err := client.Do(req)
    	if err != nil {
    		fmt.Print("http get err", err)
    		panic("http get err")
    	}
    	defer resp.Body.Close()

    	body, err := ioutil.ReadAll(resp.Body)
    	if err != nil {
    		fmt.Print("read error ", err)
    		panic("read error")
    	}
    	return string(body)
    }
    
    
    
  • 相关阅读:
    舍不得花钱的心理分析
    DLL编程的导入导出,__declspec(dllimport),__declspec(dllexport)
    浅谈C/C++内存泄漏及其检测工具
    C++多线程编程简单实例
    linux镜像源设置
    Linux基础教程 linux无密码ssh登录设置
    兄弟连教育分享:用CSS实现鼠标悬停提示的方法
    PHP基础教程 PHP的页面缓冲处理机制
    Linux基础教程 linux下cat 命令使用详解
    PHP基础教程 php 网络上关于设计模式一些总结
  • 原文地址:https://www.cnblogs.com/brady-wang/p/14005679.html
Copyright © 2020-2023  润新知