• 获取全部校园新闻


    1.取出一个新闻列表页的全部新闻 包装成函数。

    2.获取总的新闻篇数,算出新闻总页数。

    3.获取全部新闻列表页的全部新闻详情。

    4.找一个自己感兴趣的主题,进行数据爬取,并进行分词分析。不能与其它同学雷同。

    1-3:

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    # Base URL of the campus-news list; numbered pages are fetched as <base><N>.html
    listPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
    # Total page/article count; overwritten by getListPage() on the first request.
    page = 1
    # BUG FIX: the original value duplicated the header name inside the value
    # ("User-Agent:Mozilla/..."); the dict key already names the header, so the
    # value must start with "Mozilla/".
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # Get the click count of one news article from the counter API.
    def getNewsId(url):
        """Return the click count (int) for the article at *url*.

        The article id is the last 4 characters of the "_<id>.html" part of
        the URL; the count comes from a separate counter endpoint whose body
        is a JS snippet like  $('#hits').html('123');
        """
        # BUG FIX: escape the '.' before "html"; unescaped it matches any char.
        newsId = re.findall(r'_(.*)\.html', url)[0][-4:]
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
        # BUG FIX: headers must be passed by keyword; passed positionally it
        # becomes the `params` argument of requests.get().
        clickRes = requests.get(clickUrl, headers=headers)
        # BUG FIX: the original pattern "hits').html('(.*)');" contains
        # unescaped parentheses and raises re.error ("unbalanced parenthesis").
        clickCount = int(re.search(r"hits'\)\.html\('(.*)'\);", clickRes.text).group(1))
        return clickCount
    # Fetch one article page and print its details.
    def getNewDetail(detail,title,description):
        """Fetch the article at *detail* and print its metadata and body.

        detail      -- absolute URL of the article page
        title       -- title taken from the list page
        description -- summary taken from the list page
        """
        # BUG FIX: headers must be passed by keyword (positional = params).
        resDescript = requests.get(detail, headers=headers)
        resDescript.encoding = "utf-8"
        soupDescript = BeautifulSoup(resDescript.text, 'html.parser')

        content = soupDescript.select(".show-content")[0].text  # 正文 (article body)
        info = soupDescript.select(".show-info")[0].text  # info line (time/author/source/...)
        print('标题' + ': ' + title)
        print('概要' + ': ' + description)
        print('链接' + ': ' + detail)
        print('正文' + ' :' + content)
        # BUG FIX: re.search() returns a Match object or None, never the string
        # "Null"/"null", so every original `!= "Null"` test was always true and
        # .group(1) raised AttributeError whenever a field was missing.
        # The field separators are non-breaking spaces; the original patterns
        # had lost their backslashes ("xa0" instead of "\xa0").
        # NOTE(review): exact space/\xa0 layout reconstructed from the original
        # patterns -- confirm against a live page.
        timeMatch = re.search('发布时间:(.*) \xa0\xa0 \xa0\xa0作者:', info)
        time = timeMatch.group(1) if timeMatch else None
        authorMatch = re.search('作者:(.*)\xa0\xa0审核:', info)
        if authorMatch:
            author = authorMatch.group(1)
            print("作者:" + author)
        rightMatch = re.search('审核:(.*)\xa0\xa0来源:', info)
        if rightMatch:
            right = rightMatch.group(1)
        resourceMatch = re.search('来源:(.*)\xa0\xa0\xa0\xa0摄影:', info)
        if resourceMatch:
            resource = resourceMatch
        videoMatch = re.search('摄影:(.*)\xa0\xa0\xa0\xa0点击:', info)
        if videoMatch:
            video = videoMatch
        count = getNewsId(detail)  # click count via the counter API
        # BUG FIX: `time` was only bound when its regex matched, so the
        # unconditional strptime could raise; guard against a missing time.
        dateTime = datetime.strptime(time, '%Y-%m-%d %H:%M:%S') if time else None
    
    
    
    
    # Parse one news-list page and process every article on it.
    def getListPage(listPageUrl,n):
        """Process one news-list page.

        listPageUrl -- URL of the list page to fetch
        n           -- page number; 0 means the front page, from which the
                       total count is read into the global `page`.
        """
        # BUG FIX: headers must be passed by keyword (positional = params).
        resListPage = requests.get(listPageUrl, headers=headers)
        resListPage.encoding = "utf-8"
        soup = BeautifulSoup(resListPage.text, 'html.parser')
        # (removed the unused local `PageUrl`, which was built but never read)
        if n == 0:
            global page
            # ".a1" reads like "977条"; drop the trailing unit character.
            # NOTE(review): this looks like the total ARTICLE count, but the
            # caller treats it as a page count -- confirm intent.
            page = int(soup.select(".a1")[0].text[:-1])
            print(page)
        for s in soup.select("li"):
            if len(s.select(".news-list-title")) > 0:
                title = s.select(".news-list-title")[0].text  # 新闻标题 (title)
                description = s.select(".news-list-description")[0].text  # 新闻描述 (summary)
                detail = s.a.attrs["href"]  # link to the article detail page
                getNewDetail(detail, title, description)
    
    # Drive the crawl: the first call (n=0) fetches the front page and sets the
    # global `page` count, then each numbered list page is fetched in turn.
    getListPage(listPageUrl, 0)
    print(page)
    # BUG FIX: numbered list pages start at 2 ("2.html") because the front page
    # IS page 1; the original range(1, page) re-fetched the front page as
    # "1.html" and skipped the last page.  (Dead commented-out code and two
    # stray ''' string literals removed.)
    for n in range(2, page + 1):
        PageUrl = listPageUrl + str(n) + ".html"
        getListPage(PageUrl, n)

     4(爬慕课网):

    import requests,jieba
    from bs4 import BeautifulSoup
    import re
    # 网站 慕课网 -- crawl the imooc.com course list, print each course's info,
    # then run a simple character-frequency analysis over names + descriptions.
    url = "https://www.imooc.com/course/list"
    resDescript = requests.get(url)
    resDescript.encoding = "utf-8"
    soupDescript = BeautifulSoup(resDescript.text, 'html.parser')
    # positional counters for the two filter bars below
    n = 1
    m = 1
    notelist = ""  # accumulated course names + descriptions for the frequency count
    # noise characters to drop from the frequency table (stray latin letters /
    # empty string; duplicates in the original literal collapse in a set anyway)
    delete_word = {'使', 'D', 'b', 'e', 'f', 't', 'C', '', 'o', 'a', 'A', 'n', 'S', 'y', 'i'}
    # current filter selection: 方向 (direction) / 类型 (type) / 类别 (category)
    for s in soupDescript.select(".course-nav-item.on"):
        if n == 1: print("方向:" + s.text)
        elif n == 2: print("类型:" + s.text)
        elif n == 3: print("类别:" + s.text)
        n = n + 1
    # current sort selection: 受欢迎类型 (popularity) / 难度 (difficulty)
    for q in soupDescript.select(".sort-item.active"):
        if m == 1: print("受欢迎类型:" + q.text)
        else: print("难度:" + q.text)
        m = m + 1

    # per-course card: image link, name, description
    for content in soupDescript.select(".course-card-container"):
        img = content.select(".course-banner.lazy")[0].attrs["src"][2:]  # strip the leading "//"
        name = content.select(".course-card-name")[0].text
        desc = content.select(".course-card-desc")[0].text
        notelist += desc
        notelist += name
        print("图片链接:" + img)
        print("课程名字:" + name)
        print("简介:" + desc)
        # BUG FIX: the pasted source had a literal line break inside the string
        # ( print(" <newline> ") ), which is a SyntaxError; it was print("\n").
        print("\n")

    NoteDic = {}
    for i in set(notelist):    # occurrence count per distinct character
        NoteDic[i] = notelist.count(i)

    for i in delete_word:      # drop the noise characters
        if i in NoteDic:
            del NoteDic[i]
    # sort by frequency, descending (由大到小排序)
    sort_word = sorted(NoteDic.items(), key=lambda d: d[1], reverse=True)
    # print the final frequency table
    print(sort_word)

  • 相关阅读:
    git相关整理
    cookie、sessionStorage和localStorage
    AJAX学习笔记
    json web token
    加密算法
    单点登陆
    给手机网络添加手动代理
    oracle数据库索引
    类加载器
    类加载过程
  • 原文地址:https://www.cnblogs.com/qazwsx833/p/8796308.html
Copyright © 2020-2023  润新知