• Getting started with bs4 web scraping


    # -*- coding: utf-8 -*-
    """
    Created on Fri Nov 16 13:35:33 2018

    @author: zhen
    """
    import urllib.request
    from bs4 import BeautifulSoup

    # Set the target rootUrl and build a request with urllib.request.Request
    rootUrl = "https://www.cnblogs.com/"
    request = urllib.request.Request(rootUrl)

    header = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
    # Use add_header to set the request headers, disguising the script as a browser
    request.add_header("User-Agent", header)

    # Open the page with urllib.request.urlopen and save the HTML via read()
    htmlUrl = urllib.request.urlopen(request).read()

    # Build a BeautifulSoup instance from the HTML, with an explicit parser
    beautifulSoup = BeautifulSoup(htmlUrl, "html.parser")

    # Get the last page number (compare with the previous subsection on fetching
    # the last page): the pager's second-to-last <a> holds the final page number
    total_page = int(beautifulSoup.find("div", class_="pager").findAll("a")[-2].get_text())
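    # total_page now holds the number of the final list page, but the loop
    # below only crawls posts linked from the home page itself; see the
    # pagination sketch after this listing for one way to put it to use.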

    list_item = beautifulSoup.findAll("a", class_="titlelnk")
    for i in list_item:  # iterate over all post links
        href = i["href"]  # get the link's href
        req = urllib.request.Request(href)
        req.add_header("User-Agent", header)
        html = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(html, "html.parser")
        # Get the title
        titleContent = soup.find("a", id="cb_post_title_url")
        if titleContent is not None:  # skip pages without a title link
            title = titleContent.get_text()
            # Get the body text (taken from the first <div> on the page)
            content = soup.find("div").get_text().strip()
            print(title, "\n=====================================\n", content[1:100])

    Crawler results: (output screenshot omitted)

  • Original article: https://www.cnblogs.com/yszd/p/9974800.html