• 爬虫大作业


    import requests
    import re
    from bs4 import BeautifulSoup
    import jieba.analyse
    from PIL import Image, ImageSequence
    import numpy as np
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud, ImageColorGenerator
    
    # 获取总页数
    def getnum(url):
        """Return the total page count advertised by the pager of a listing page.

        Downloads ``url``, decodes the response as gb2312, takes the text of
        the first element matching the ``.page-next`` selector and captures
        the digits of the "共N页" ("N pages in total") fragment.

        Args:
            url: Address of a listing page that contains the pager element.

        Returns:
            The page count as a string of digits, exactly as captured from
            the page; callers convert it with ``int``.

        Raises:
            ValueError: If the pager element or the "共N页" text is missing.
        """
        res = requests.get(url)
        res.encoding = 'gb2312'
        soup = BeautifulSoup(res.text, 'html.parser')
        pager = soup.select_one(".page-next")
        if pager is None:
            raise ValueError("no '.page-next' element found at {}".format(url))
        # The digit class must be written as \d inside a raw string; a bare
        # "d+" only matches literal letter d's and never finds the count.
        match = re.search(r"共(\d+)页", pager.text)
        if match is None:
            raise ValueError("no page count found in pager text: {!r}".format(pager.text))
        return match.group(1)
    
    #获取单个页面所有链接
    def getpageurl(url):
        """Scrape every movie linked from one listing page.

        Downloads ``url``, decodes it as gb2312 and walks the ``<li>`` items
        under ``.list-page ul``. For each item that contains a link, the
        link target is resolved against the site root, printed, and passed
        to ``getinfromation``.

        Args:
            url: Address of the listing page to scan.

        Returns:
            None. The work happens through printing and ``getinfromation``.
        """
        # Function-scope import so the block does not depend on the file header.
        from urllib.parse import urljoin

        res = requests.get(url)
        res.encoding = 'gb2312'
        soup = BeautifulSoup(res.text, 'html.parser')
        for item in soup.select(".list-page ul li"):
            link = item.select_one("a")
            if link is None:
                continue
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be followed.
                continue
            # urljoin avoids the "//" that plain concatenation produces for
            # root-relative hrefs and leaves absolute hrefs intact.
            pageurl = urljoin('http://www.ckck.tv/', href)
            print(pageurl)
            getinfromation(pageurl)
    # 获取页面的信息
    def getinfromation(url):
        """Scrape one movie detail page, print its fields and append them to a file.

        Downloads ``url``, decodes it as gb2312 and reads the title from the
        first ``.content .movie ul h1`` element, the cast from the second
        ``.content .movie ul li`` item and the year and region from the
        fifth one. The four values are printed and then appended, without
        separators, as one line to ``gzccNews.txt`` (UTF-8).

        Args:
            url: Address of the movie detail page.

        Returns:
            None.

        Raises:
            IndexError: If the expected title or list elements are missing.
            AttributeError: If a list item does not contain the expected
                label, so the regular expression does not match.
        """
        res = requests.get(url)
        res.encoding = 'gb2312'
        soup = BeautifulSoup(res.text, 'html.parser')
        a = soup.select(".content .movie ul h1")[0].text
        print("电影:", a)
        # Query the detail list once and index into it for both items.
        items = soup.select(".content .movie ul li")
        b = items[1].text
        name = re.search("【主 演】:(.*)", b).group(1)
        print("主演:", name)
        c = items[4].text
        date = re.search("【年 代】:(.*)  【地 区】:", c).group(1)
        print("年代:", date)
        diqu = re.search("【地 区】:(.*)", c).group(1)
        print("地区:", diqu)
        # Append the extracted fields to the file; the context manager closes
        # the handle even if a write fails.
        with open('gzccNews.txt', 'a', encoding='utf-8') as f:
            f.write(a)
            f.write(name)
            f.write(date)
            f.write(diqu)
            # One record per line (the literal had been split across two lines).
            f.write("\n")
    
    # 生成词云
    def getpicture():
        """Build a word cloud from the scraped text, display it and save it.

        Reads all of ``gzccNews.txt``, extracts up to 50 weighted keywords
        with ``jieba.analyse.textrank``, prints them, and renders them as a
        word cloud masked and coloured by ``input.jpg``. The result is shown
        with matplotlib and written to ``output.png``.

        Returns:
            None.
        """
        # Read the file in one call. Iterating the handle and calling read()
        # inside the loop consumed the first line without keeping it.
        with open('gzccNews.txt', 'r', encoding='utf-8') as f:
            lyric = f.read()

        result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
        # Each entry is indexed as [0] -> keyword, [1] -> weight.
        keywords = {entry[0]: entry[1] for entry in result}
        print(keywords)

        image = Image.open('input.jpg')
        graph = np.array(image)
        wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='White', max_words=50, mask=graph)
        wc.generate_from_frequencies(keywords)
        image_color = ImageColorGenerator(graph)
        # Only the recoloured cloud is drawn; an earlier imshow of the
        # uncoloured cloud was immediately painted over by this one.
        plt.imshow(wc.recolor(color_func=image_color))
        plt.axis("off")
        plt.show()
        wc.to_file('output.png')
    
    
    # Driver: scrape the index page, then each numbered listing page, and
    # finally render the word cloud.
    url = 'http://www.ckck.tv/xj/Index.html'
    a = getnum(url)
    getpageurl(url)
    page_template = 'http://www.ckck.tv/xj/List_4_{}.html'
    # NOTE(review): range() stops before int(a), so a listing page numbered
    # with the total count itself is never requested -- confirm against the
    # site's page numbering whether that page exists.
    for i in range(2, int(a)):
        page = page_template.format(i)
        getpageurl(page)
    getpicture()

    首先定义获取总页数、获取单个页面所有链接、获取页面信息、生成词云等函数。过程中在获取所有页面的全部链接时出现了一点问题,原因归结于没有找对标签。这次爬取的是一个电影网站,先将网站里的电影名、主演、年代、地区抓取下来写入文件,然后用这些内容生成词云。

  • 相关阅读:
    使用XMLReader读XML
    C#命名空间大全详细教程
    C# using 三种使用方式
    SVN服务器搭建
    简单的自定义Session
    python学习第十八天 --错误&异常处理
    锁的等级:方法锁、实例锁、类锁
    java线程池如何合理的设置大小
    挖掘两个Integer对象的swap的内幕
    实现线程同步的方式
  • 原文地址:https://www.cnblogs.com/qq974975766/p/8909008.html
Copyright © 2020-2023  润新知