• A complete Python course project


    Since we were free to pick any website of interest for data analysis, this project crawls Xinhua News at "http://www.xinhuanet.com/", analyzes the scraped headlines, and generates a word cloud.

    Packages used by the program

    import requests                  # fetch the page
    import re                        # regular expressions (unused below)
    from bs4 import BeautifulSoup    # parse the HTML
    from datetime import datetime    # timestamps (unused below)
    import pandas
    import sqlite3
    import jieba                     # Chinese word segmentation
    from wordcloud import WordCloud  # word-cloud rendering
    import matplotlib.pyplot as plt  # display the image
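
    If any of these are missing, they can be installed with pip (the PyPI package names below are the usual ones for these imports):

    pip install requests beautifulsoup4 pandas jieba wordcloud matplotlib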

    Crawl the page

    url = "http://www.xinhuanet.com/"
    
    f = open("css.txt", "w+", encoding="utf-8")   # UTF-8 so Chinese titles survive on any platform
    res0 = requests.get(url)
    res0.encoding = "utf-8"
    soup = BeautifulSoup(res0.text, "html.parser")
    for news in soup.select("li"):                # every <li> on the front page
        if len(news.select("a")) > 0:             # keep only items that contain a link
            title = news.select("a")[0].text
            print(title)
            f.write(title + "\n")                 # one headline per line
    f.close()
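
    Note that the bare "li" selector also sweeps up navigation and footer links along with headlines. A minimal sketch of one way to narrow it, assuming article links end in ".htm" (the URL pattern is an assumption; the site's markup changes over time):

    # sketch: keep only anchors that look like article links (the ".htm" pattern is an assumption)
    for a in soup.select("li a[href]"):
        if a["href"].endswith(".htm") and a.text.strip():
            print(a.text.strip())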
    

    Read the txt file back and count word frequencies

    f0 = open('css.txt', 'r', encoding='utf-8')
    qz = f0.read()
    f0.close()
    print(qz)
    
    words = list(jieba.cut(qz))          # segment the headlines into words
    
    # punctuation and whitespace to exclude from the counts
    ul = {':', ':', ',', '、', '。', '“', '”', '!', '?', ' ', '\u3000', '\n'}
    dic = {}
    
    keys = set(words) - ul
    for i in keys:
        dic[i] = words.count(i)          # frequency of each distinct word
    
    c = list(dic.items())
    c.sort(key=lambda x: x[1], reverse=True)
    
    f1 = open('diectory.txt', 'w', encoding='utf-8')
    for i in range(10):                  # top ten words
        print(c[i])
        for words_count in range(c[i][1]):
            f1.write(c[i][0] + ' ')      # repeat each word once per occurrence for WordCloud
    f1.close()
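
    A note on speed: calling words.count(i) for every distinct word rescans the whole list each time, which is quadratic. A sketch of the same top-ten computation with collections.Counter, which makes a single pass (an alternative, not what the original uses):

    from collections import Counter

    counts = Counter(w for w in words if w not in ul)   # one pass over the token list
    print(counts.most_common(10))                       # [(word, count), ...] most frequent first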

    Store into the database

    df = pandas.DataFrame(words, columns=['word'])   # one segmented word per row
    
    print(df.head())
    
    with sqlite3.connect('newsdb3.sqlite') as db:
        df.to_sql('newsdb3', con=db, if_exists='replace')   # allow reruns without a "table exists" error
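
    To verify what was written, the table can be read straight back with pandas:

    with sqlite3.connect('newsdb3.sqlite') as db:
        print(pandas.read_sql_query('SELECT * FROM newsdb3 LIMIT 5', db))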

    Generate the word cloud

    f3 = open('diectory.txt', 'r', encoding='utf-8')
    cy_file = f3.read()
    f3.close()
    # WordCloud's default font has no CJK glyphs; point font_path at any Chinese
    # font on your system (the path below is an assumption, not a fixed location)
    cy = WordCloud(font_path='simhei.ttf').generate(cy_file)
    plt.imshow(cy)
    plt.axis("off")
    plt.show()
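
    The image can also be saved to disk instead of only being shown:

    cy.to_file('wordcloud.png')   # write the rendered cloud as a PNG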

    Final result (the generated word-cloud image)

    Complete code

    import requests                  # fetch the page
    import re                        # regular expressions (unused below)
    from bs4 import BeautifulSoup    # parse the HTML
    from datetime import datetime    # timestamps (unused below)
    import pandas
    import sqlite3
    import jieba                     # Chinese word segmentation
    from wordcloud import WordCloud  # word-cloud rendering
    import matplotlib.pyplot as plt  # display the image
    
    
    url = "http://www.xinhuanet.com/"
    
    f = open("css.txt", "w+", encoding="utf-8")   # UTF-8 so Chinese titles survive on any platform
    res0 = requests.get(url)
    res0.encoding = "utf-8"
    soup = BeautifulSoup(res0.text, "html.parser")
    for news in soup.select("li"):                # every <li> on the front page
        if len(news.select("a")) > 0:             # keep only items that contain a link
            title = news.select("a")[0].text
            print(title)
            f.write(title + "\n")                 # one headline per line
    f.close()
    
    f0 = open('css.txt', 'r', encoding='utf-8')
    qz = f0.read()
    f0.close()
    print(qz)
    
    words = list(jieba.cut(qz))          # segment the headlines into words
    
    # punctuation and whitespace to exclude from the counts
    ul = {':', ':', ',', '、', '。', '“', '”', '!', '?', ' ', '\u3000', '\n'}
    dic = {}
    
    keys = set(words) - ul
    for i in keys:
        dic[i] = words.count(i)          # frequency of each distinct word
    
    c = list(dic.items())
    c.sort(key=lambda x: x[1], reverse=True)
    
    f1 = open('diectory.txt', 'w', encoding='utf-8')
    for i in range(10):                  # top ten words
        print(c[i])
        for words_count in range(c[i][1]):
            f1.write(c[i][0] + ' ')      # repeat each word once per occurrence for WordCloud
    f1.close()
    
    df = pandas.DataFrame(words, columns=['word'])   # one segmented word per row
    
    print(df.head())
    
    with sqlite3.connect('newsdb3.sqlite') as db:
        df.to_sql('newsdb3', con=db, if_exists='replace')   # allow reruns without a "table exists" error
    
    
    f3 = open('diectory.txt', 'r', encoding='utf-8')
    cy_file = f3.read()
    f3.close()
    # a Chinese font is required for CJK glyphs; the path is system-dependent
    cy = WordCloud(font_path='simhei.ttf').generate(cy_file)
    plt.imshow(cy)
    plt.axis("off")
    plt.show()