package main

import (
    "fmt"
    "net/http"
    "os"
    "regexp"
    "strconv"
    "strings"
)

// URL pattern:
// page 1: https://www.pengfu.com/xiaohua_1.html
// page 2: https://www.pengfu.com/xiaohua_2.html
// page 3: https://www.pengfu.com/xiaohua_3.html
// Each listing page's source contains ten occurrences of `<h1 class="dp-b"><a href="`,
// so the actual search pattern is `<h1 class="dp-b"><a href="<joke URL>"`.
// On a joke's own page the title sits in an <h1>, but the page uses the h1 tag
// in two places, so filter the matches and keep only the first one.
// The content starts at `<div class="content-txt pt10">` and ends at
// `<a id="prev" href="https://www.pengfu.com/content_1850587_1.html">`.

// HttpGet fetches a URL and returns the response body as a string.
func HttpGet(url string) (result string, err error) {
    resp, err1 := http.Get(url)
    if err1 != nil {
        err = err1
        return
    }
    defer resp.Body.Close() // close the body when done

    // read the response body in 4 KB chunks
    buf := make([]byte, 4*1024)
    for {
        n, _ := resp.Body.Read(buf)
        if n == 0 {
            break
        }
        result += string(buf[:n]) // accumulate what was read
    }
    return
}

// SpiderOneJoy crawls a single joke page and extracts its title and content.
func SpiderOneJoy(url string) (title, content string, err error) {
    // fetch the page
    result, err1 := HttpGet(url)
    if err1 != nil {
        fmt.Println("SpiderOneJoy HttpGet err = ", err1)
        err = err1
        return
    }

    // extract the title; regexp.MustCompile panics on a bad pattern,
    // so no nil check is needed
    re1 := regexp.MustCompile(`<h1>(.*?)</h1>`)
    tmpTitle := re1.FindAllStringSubmatch(result, 1) // only the first match is wanted
    for _, data := range tmpTitle {
        title = data[1]
        title = strings.Replace(title, "\t", "", -1)
        title = strings.Replace(title, "\n", "", -1)
        title = strings.Replace(title, "\r", "", -1)
        title = strings.Replace(title, " ", "", -1)
        break
    }

    // extract the content
    re2 := regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?))<a id="prev" href="`)
    tmpContent := re2.FindAllStringSubmatch(result, -1)
    for _, data := range tmpContent {
        content = data[1]
        content = strings.Replace(content, "\t", "", -1)
        content = strings.Replace(content, "\n", "", -1)
        content = strings.Replace(content, "\r", "", -1)
        content = strings.Replace(content, " ", "", -1)
        content = strings.Replace(content, "<br>", "", -1)
        content = strings.Replace(content, "<br/>", "", -1)
        // this chain of replacements is clumsy; there must be a better way
        // (see the sketch below)
        break
    }
    return
}
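// The chain of strings.Replace calls above is exactly what the original
// comment flags as clumsy. A minimal alternative sketch (not part of the
// original crawler): strings.NewReplacer applies all of the substitutions in
// a single pass. The names joyCleaner and cleanJoyText are hypothetical.
var joyCleaner = strings.NewReplacer(
    "\t", "", "\n", "", "\r", "", " ", "",
    "<br>", "", "<br/>", "",
)

func cleanJoyText(s string) string {
    return joyCleaner.Replace(s) // one pass instead of six Replace calls
}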
// StoreJoyToFile writes one page's titles and contents to "<i>.txt".
func StoreJoyToFile(i int, fileTitle, fileContent []string) {
    // create the output file
    f, err := os.Create(strconv.Itoa(i) + ".txt")
    if err != nil {
        fmt.Println("os.Create err = ", err)
        return
    }
    defer f.Close()

    // write each title/content pair
    n := len(fileTitle)
    for i := 0; i < n; i++ {
        f.WriteString(fileTitle[i] + "\n")   // title
        f.WriteString(fileContent[i] + "\n") // content
        f.WriteString("--------------------------------------\n")
    }
}

// SpiderPage crawls one listing page, follows every joke link on it, and
// reports its page number on the channel when done.
func SpiderPage(i int, page chan<- int) {
    url := "https://www.pengfu.com/xiaohua_" + strconv.Itoa(i) + ".html"
    fmt.Printf("Crawling %s\n", url)

    // fetch the listing page
    result, err := HttpGet(url)
    if err != nil {
        fmt.Println("HttpGet err = ", err)
        return
    }
    //fmt.Println("r = ", result)

    // the page contains ten `<h1 class="dp-b"><a href="joke URL"` occurrences;
    // only the URL inside the capture group is needed here
    re := regexp.MustCompile(`<h1 class="dp-b"><a href="(.*?)"`)

    // apply the pattern and collect the key information
    joyUrls := re.FindAllStringSubmatch(result, -1)
    fileTitle := make([]string, 0)
    fileContent := make([]string, 0)
    //fmt.Println("url = ", joyUrls)

    // for each match, data[0] is the full match (not needed) and data[1] the URL
    for _, data := range joyUrls {
        fmt.Println("data = ", data[1])
        // crawl one joke; it returns the title, the content, and possibly an error
        title, content, err := SpiderOneJoy(data[1])
        if err != nil {
            fmt.Println("SpiderOneJoy err ", err)
            continue
        }
        //fmt.Printf("title = #%v#\n", title) // printing this way shows any stray whitespace left in the result
        //fmt.Printf("content = #%v#\n", content)
        fileTitle = append(fileTitle, title)       // gather all titles in one slice
        fileContent = append(fileContent, content) // gather all contents in another
    }

    // write the results to a file
    StoreJoyToFile(i, fileTitle, fileContent)

    page <- i // report which page has finished
}

// DoWork starts one goroutine per listing page and waits for all of them to finish.
func DoWork(start, end int) {
    fmt.Printf("Preparing to crawl pages %d through %d\n", start, end)
    page := make(chan int)
    for i := start; i <= end; i++ {
        // crawl each listing page concurrently
        go SpiderPage(i, page)
    }
    // receive one completion signal per page
    for i := start; i <= end; i++ {
        fmt.Printf("Page %d crawled\n", <-page)
    }
}

func main() {
    var start, end int
    fmt.Printf("Enter the first page (>= 1): ")
    fmt.Scan(&start)
    fmt.Printf("Enter the last page: ")
    fmt.Scan(&end)
    DoWork(start, end)
}
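// A minimal sketch (not part of the original crawler) of the capture-group
// technique described in the pattern notes at the top: FindAllStringSubmatch
// returns, for each match, the whole match at index 0 and the capture group
// at index 1. The HTML fragment and the name demoExtract are hypothetical.
func demoExtract() {
    html := `<h1 class="dp-b"><a href="https://www.pengfu.com/content_1.html">one</a></h1>` +
        `<h1 class="dp-b"><a href="https://www.pengfu.com/content_2.html">two</a></h1>`
    re := regexp.MustCompile(`<h1 class="dp-b"><a href="(.*?)"`)
    for _, m := range re.FindAllStringSubmatch(html, -1) {
        fmt.Println(m[1]) // prints each extracted joke URL
    }
}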