• bs4抓取糗事百科


    抓取糗事百科的内容及评论(不包含图片信息)。代码中 'User-Agent' 对应的值填入自己浏览器的用户代理字符串即可:以360极速浏览器为例,在地址栏输入 about:version 并回车,"用户代理"后面的一长串就是需要填入 '' 里面的内容。其他浏览器的查看方法可以自行百度。

    import urllib.request
    import re
    from urllib import request
    from bs4 import BeautifulSoup
    
    # 1. Fetch the page source
    def get_html(url):
        """Download ``url`` and return the response body decoded as UTF-8.

        The request is sent with the ``User-Agent`` header defined below. The
        value is an empty string as written here; replace it with a browser's
        user-agent string before use.

        Args:
            url: Address of the page to download.

        Returns:
            The full response body as ``str``.

        Raises:
            urllib.error.URLError: If ``urlopen`` cannot complete the request.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        headers = {
            'User-Agent': '',
        }
        req = request.Request(headers=headers, url=url)
        # Read inside a ``with`` block so the response is closed even when
        # read() or decode() raises; the original never closed it.
        with request.urlopen(req) as response:
            return response.read().decode('utf-8')
    
    # Build the comment-page link for every article on a listing page
    def get_comment_link(content,comment_url_base):
        """Visit the comment page of each article found in ``content``.

        Every element whose CSS class matches the article pattern is located,
        the third ``_``-separated part of its ``id`` attribute is substituted
        into ``comment_url_base``, and the resulting URL is handed to
        ``get_comment_content`` together with a 1-based running article number.

        Elements without an ``id``, or whose ``id`` has fewer than three
        ``_``-separated parts, are skipped instead of raising ``IndexError``;
        skipped elements do not consume an article number.

        Args:
            content: HTML source of the listing page.
            comment_url_base: ``%``-format template with one placeholder for
                the article identifier.

        Returns:
            None. Output is produced by ``get_comment_content``.
        """
        soup = BeautifulSoup(content, 'html.parser')
        article_floor = 1
        # A non-dict ``attrs`` value is treated by BeautifulSoup as a filter on
        # the CSS class attribute.
        for article in soup.find_all(attrs=re.compile(r"article block untagged mb15.*?")):
            id_parts = str(article.get('id')).strip().split("_")
            if len(id_parts) < 3:
                # Missing or unexpectedly shaped id: no article identifier to
                # build a comment URL from.
                continue
            comment_url = comment_url_base % id_parts[2]  # comment page link
            get_comment_content(comment_url, article_floor)  # print article and comments
            article_floor += 1
    
    # Print the article text and the text of each comment
    def get_comment_content(comment_url,articleFloor):
        """Download one article page and print its text followed by its comments.

        Args:
            comment_url: URL of the article/comment page to download.
            articleFloor: Number printed in front of the article text.

        Returns:
            None. Everything is written to standard output.
        """
        page_soup = BeautifulSoup(get_html(comment_url), 'html.parser')

        # Article text: every <div class="content"> on the page.
        for body in page_soup.find_all('div', class_='content'):
            print(articleFloor, ".", body.get_text().strip())

        # Comment text: every element carrying the CSS class "body",
        # numbered from 1 in document order.
        replies = page_soup.find_all(attrs="body")
        for floor, reply in enumerate(replies, start=1):
            print("      ", floor, "楼回复:", reply.get_text())
    
    def command():
        """Prompt the user once and run ``main`` if they choose to view.

        The prompt tells the user to press Enter to view or type ``exit`` to
        quit. Pressing Enter makes ``input`` return an empty string, which the
        original code did not accept (only the typed word ``enter`` worked).
        Both an empty answer and the word ``enter`` now start the scrape; any
        other answer, including ``exit``, returns without doing anything.

        Returns:
            None.
        """
        raw = input("点击enter查看或者输入exit退出,请输入你的选择:")
        # '' is what a bare Enter key press yields; 'enter' is kept so the
        # previously accepted answer still works.
        if raw.strip() in ('', 'enter'):
            main()
    
    
    def main(page=2):
        """Scrape one listing page and print its articles with their comments.

        Args:
            page: Page number substituted into the listing URL. Defaults to 2,
                the value that was previously hard-coded.

        Returns:
            None. Output is printed by the functions this calls.
        """
        article_url_base = 'https://www.qiushibaike.com/8hr/page/%d/'  # listing page address
        comment_url_base = 'https://www.qiushibaike.com/article/%s'  # comment page address
        article_url = article_url_base % page
        content = get_html(article_url)
        get_comment_link(content, comment_url_base)
    
    # Start the interactive prompt only when this file is run as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        command()
    View Code
  • 相关阅读:
    Word Break
    Binary Tree Right Side View
    41. First Missing Positive
    2 Sum ,3 Sum, 3 Sum close
    216. Combination Sum III
    190. Reverse Bits
    143. Reorder List
    142. Linked List Cycle II
    Single Number i,ii,iii
    62. Unique Paths i & ii
  • 原文地址:https://www.cnblogs.com/smart-zihan/p/9615915.html
Copyright © 2020-2023  润新知