• Structuring and Saving the Data


    1. Save the body text of each news article to a text file.

    def writeToDocument(filename, content):
        # Append the article body; mode 'a' keeps earlier articles in the file.
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(content)
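
    A quick usage check; the filename matches the one used in the comments further down, and because the file is opened in append mode, repeated calls accumulate content:

    writeToDocument('gzccNews.txt', '示例正文\n')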

    2. Structure the news data as a list of dictionaries:

    (1) Details of one news article --> dictionary news

    import re
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime

    def getNewsDetail(newsUrl):
        resdet = requests.get(newsUrl)
        resdet.encoding = 'utf-8'
        soupdet = BeautifulSoup(resdet.text, 'html.parser')
        news = {}

        news['title'] = soupdet.select('.show-title')[0].text

        if soupdet.select('.show-info'):
            showinfo = soupdet.select('.show-info')[0].text
            # lstrip strips any leading characters found in "发布时间:", which
            # removes the label; the next 19 chars are "YYYY-MM-DD HH:MM:SS".
            date = showinfo.lstrip("发布时间:")[:19]
            news['dateTime'] = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

            if '作者' in showinfo:
                news['author'] = re.search(r'作者:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
            else:
                news['author'] = 'none'

            if '审核' in showinfo:
                news['checker'] = re.search(r'审核:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
            else:
                news['checker'] = 'none'

            if '来源' in showinfo:
                # Non-greedy match up to the next field label (摄影 or 点击).
                news['source'] = re.search(r'来源:(.*?)\s*(摄|点)', showinfo).group(1)
            else:
                news['source'] = 'none'

            if '摄影' in showinfo:
                news['photographer'] = re.search(r'摄影:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
            else:
                news['photographer'] = 'none'

            news['clicktimes'] = getClickCount(newsUrl)
        else:
            return

        if soupdet.select('.show-content'):
            news['contentdetail'] = soupdet.select('.show-content')[0].text
        else:
            return

        news['newsUrl'] = newsUrl

        # writeToDocument('gzccNews.txt', news['contentdetail'])
        # print("发表时间:{0}  作者:{1}  审核:{2}  来源:{3}  摄影:{4}  点击次数:{5} 次".format(
        #     news['dateTime'], news['author'], news['checker'], news['source'], news['photographer'], news['clicktimes']))

        return news
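
    getNewsDetail calls getClickCount, which was defined in an earlier part of this exercise and is not repeated here. For reference, a minimal sketch: it assumes the click count is served by a separate counter endpoint, and the URL pattern, the id-extracting regex, and the response format below are all assumptions carried over from that earlier step, not verified here:

    def getClickCount(newsUrl):
        # Assumed: the news id is the numeric filename, e.g. ".../9183.html".
        newsId = re.search(r'/(\d+)\.html', newsUrl).group(1)
        # Assumed counter endpoint; the real URL comes from the earlier step.
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        resc = requests.get(clickUrl)
        # Assumed response shape: a small JS snippet; pull the first integer
        # that appears after the last ".html" in it.
        return int(re.search(r'\d+', resc.text.split('.html')[-1]).group())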

    (2) Collect every article on one list page --> list newsls.append(news)

    def getListDetail(ListPageUrl):
        resl = requests.get(ListPageUrl)
        resl.encoding = 'utf-8'
        soupl = BeautifulSoup(resl.text, 'html.parser')
        newsls = []
        for news in soupl.select('li'):
            # Only <li> elements carrying a title are real news entries.
            if len(news.select('.news-list-title')) > 0:
                gzccNewslist = {}
                gzccNewslist['title'] = news.select('.news-list-title')[0].text
                gzccNewslist['description'] = news.select('.news-list-description')[0].text
                gzccNewslist['info'] = news.select('.news-list-info')[0].text
                gzccNewslist['address'] = news.select('a')[0]['href']

                # print("\n标题: {0}\n描述: {1}\n信息: {2}\n链接: {3}".format(
                #     gzccNewslist['title'], gzccNewslist['description'], gzccNewslist['info'], gzccNewslist['address']))

                detail = getNewsDetail(gzccNewslist['address'])
                if detail:  # getNewsDetail returns None for pages missing show-info/content
                    newsls.append(detail)
        return newsls
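
    A quick sanity check against the first list page (the URL is the same one used in the crawl below):

    newsls = getListDetail('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    print(len(newsls), newsls[0]['title'])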

    (3) Merge the news from every list page --> list newstotal.extend(newsls)

    import locale

    # 'chinese' is a Windows locale name; it makes Chinese text print correctly.
    locale.setlocale(locale.LC_CTYPE, 'chinese')
    newstotal = []
    Listurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    pagecount = getPageNum(Listurl)
    for i in range(1, pagecount + 1):
        if i == 1:
            ListPageUrl = Listurl  # the first list page has no page number
        else:
            ListPageUrl = Listurl + '{}.html'.format(i)
        newstotal.extend(getListDetail(ListPageUrl))
        break  # crawl only the first page while testing; remove to crawl them all
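
    getPageNum also comes from an earlier part of this exercise. A minimal sketch, assuming the list page shows the total article count in an element with class a1 as text like "1234条" and that each list page holds 10 articles (both are assumptions, not verified here):

    def getPageNum(url):
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # Assumed: '.a1' holds the total article count, e.g. "1234条".
        n = int(soup.select('.a1')[0].text.rstrip('条'))
        # Assumed 10 articles per list page; round up.
        return n // 10 + (1 if n % 10 else 0)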

    3. Install pandas and use pandas.DataFrame(newstotal) to create a DataFrame object df.

    import pandas

    df = pandas.DataFrame(newstotal)

    4. Save the extracted data to a CSV or Excel file via df.

    # Writing .xlsx requires an Excel engine such as openpyxl to be installed.
    df.to_excel('gzccnews.xlsx')
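
    The step also allows CSV; an equivalent pandas call is shown below (utf_8_sig adds a BOM so Excel displays the Chinese text correctly; the filename is an arbitrary choice):

    df.to_csv('gzccnews.csv', encoding='utf_8_sig')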

    5. Analyze the data with the functions and methods pandas provides:

    (1) Extract the first 6 rows of the click count, title, and source columns

    print(df[['clicktimes', 'title', 'source']].head(6))

    (2) Extract news published by '学校综合办' whose click count exceeds 3000.

    print(df[(df['clicktimes'] > 3000) & (df['source'] == '学校综合办')])

    (3) Extract news published by '国际学院' and '学生工作处'.

    soulist = ['国际学院', '学生工作处']
    print(df[df['source'].isin(soulist)])