• 爬取新闻列表


    • 获取单条新闻的#标题#链接#时间#来源#内容 #点击次数,并包装成一个函数。
      import requests
      from bs4 import BeautifulSoup
      import re
      
      # Landing page (page 1) of the campus news channel.
      url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
      res = requests.get(url_main)
      # The site serves UTF-8; set it explicitly so res.text decodes correctly.
      res.encoding = 'utf-8'
      
      soup = BeautifulSoup(res.text,'html.parser')
      # All <li> tags on the page; actual news entries are filtered later
      # by the presence of a .news-list-title child.
      li = soup.select('li')
      
      def gethits(url_1):
          """Return the click count (as a string) for one news article.

          *url_1* is the article detail URL ending in ``_<...>/<id>.html``.
          The numeric id is extracted and sent to the OA counter API, which
          replies with JSONP-like JavaScript such as
          ``$('#hits').html('205');`` — the digits inside ``html('...')``
          are the hit count.
          """
          # group(1) is the captured article id (was groups(0)[0]).
          li_id = re.search(r'_.*/(.*).html', url_1).group(1)
          res_hits = requests.get(
              'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(li_id))
          # Strip the surrounding "html('...');" wrapper to leave the number.
          hits = res_hits.text.split('.')[-1].rstrip("');").lstrip("html('")
          return hits
      
      def getpageinfo(label):
          """Print time/title/link/source/hits/body of the FIRST real news
          item found in *label*, then stop.

          *label* is a list of <li> tags from a list page; entries without
          a .news-list-title child (navigation items etc.) are skipped.
          """
          for title_list in label:
              if len(title_list.select('.news-list-title')) > 0:
                  href = title_list.select('a')[0]['href']
                  title = title_list.select('.news-list-title')[0].text
                  time = title_list.select('span')[0].text
                  info = title_list.select('span')[1].text

                  # Fetch the detail page for the full article body.
                  res_list = requests.get(href)
                  res_list.encoding = 'utf-8'
                  soup_list = BeautifulSoup(res_list.text, 'html.parser')
                  text_list = soup_list.select('.show-content')[0].text

                  hits_list = gethits(href)

                  # '\n' escapes replace the raw line breaks that made the
                  # original string literals a syntax error.
                  print('时间:', time, '\n标题:', title, '\n链接:', href,
                        '\n来源:', info, '\n点击次数:', hits_list, '\n')
                  print('正文:', text_list)
                  # Only a single news item is wanted in this exercise.
                  break
      
      getpageinfo(li)

    • 获取一个新闻列表页的所有新闻的上述详情,并包装成一个函数。
    import requests
    from bs4 import BeautifulSoup
    import re
    
    # Landing page (page 1) of the campus news channel.
    url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(url_main)
    # The site serves UTF-8; set it explicitly so res.text decodes correctly.
    res.encoding = 'utf-8'
    
    soup = BeautifulSoup(res.text,'html.parser')
    # All <li> tags on the page; actual news entries are filtered later
    # by the presence of a .news-list-title child.
    li = soup.select('li')
    
    def gethits(url_1):
        """Return the click count (as a string) for one news article.

        *url_1* is the article detail URL ending in ``_<...>/<id>.html``.
        The numeric id is extracted and sent to the OA counter API, which
        replies with JSONP-like JavaScript such as
        ``$('#hits').html('205');`` — the digits inside ``html('...')``
        are the hit count.
        """
        # group(1) is the captured article id (was groups(0)[0]).
        li_id = re.search(r'_.*/(.*).html', url_1).group(1)
        res_hits = requests.get(
            'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(li_id))
        # Strip the surrounding "html('...');" wrapper to leave the number.
        hits = res_hits.text.split('.')[-1].rstrip("');").lstrip("html('")
        return hits
    
    def getpageinfo(label):
        """Print time/title/link/source/hits/body for EVERY news item in *label*.

        *label* is a list of <li> tags from a list page; entries without
        a .news-list-title child (navigation items etc.) are skipped.
        """
        for title_list in label:
            if len(title_list.select('.news-list-title')) > 0:
                href = title_list.select('a')[0]['href']
                title = title_list.select('.news-list-title')[0].text
                time = title_list.select('span')[0].text
                info = title_list.select('span')[1].text

                # Fetch the detail page for the full article body.
                res_list = requests.get(href)
                res_list.encoding = 'utf-8'
                soup_list = BeautifulSoup(res_list.text, 'html.parser')
                text_list = soup_list.select('.show-content')[0].text

                hits_list = gethits(href)

                # '\n' escapes replace the raw line breaks that made the
                # original string literals a syntax error.
                print('时间:', time, '\n标题:', title, '\n链接:', href,
                      '\n来源:', info, '\n点击次数:', hits_list, '\n')
                print('正文:', text_list)
    
    getpageinfo(li)

     

    • 获取所有新闻列表页的网址,调用上述函数
    import requests
    from bs4 import BeautifulSoup
    import re
    
    # Landing page (page 1) of the campus news channel.
    url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(url_main)
    # The site serves UTF-8; set it explicitly so res.text decodes correctly.
    res.encoding = 'utf-8'
    
    soup = BeautifulSoup(res.text,'html.parser')
    # All <li> tags on the page; actual news entries are filtered later
    # by the presence of a .news-list-title child.
    li = soup.select('li')
    
    def gethits(url_1):
        """Return the click count (as a string) for one news article.

        *url_1* is the article detail URL ending in ``_<...>/<id>.html``.
        The numeric id is extracted and sent to the OA counter API, which
        replies with JSONP-like JavaScript such as
        ``$('#hits').html('205');`` — the digits inside ``html('...')``
        are the hit count.
        """
        # group(1) is the captured article id (was groups(0)[0]).
        li_id = re.search(r'_.*/(.*).html', url_1).group(1)
        res_hits = requests.get(
            'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(li_id))
        # Strip the surrounding "html('...');" wrapper to leave the number.
        hits = res_hits.text.split('.')[-1].rstrip("');").lstrip("html('")
        return hits
    
    def getpageinfo(label):
        """Print time/title/link/source/hits/body for EVERY news item in *label*.

        *label* is a list of <li> tags from a list page; entries without
        a .news-list-title child (navigation items etc.) are skipped.
        """
        for title_list in label:
            if len(title_list.select('.news-list-title')) > 0:
                href = title_list.select('a')[0]['href']
                title = title_list.select('.news-list-title')[0].text
                time = title_list.select('span')[0].text
                info = title_list.select('span')[1].text

                # Fetch the detail page for the full article body.
                res_list = requests.get(href)
                res_list.encoding = 'utf-8'
                soup_list = BeautifulSoup(res_list.text, 'html.parser')
                text_list = soup_list.select('.show-content')[0].text

                hits_list = gethits(href)

                # The original version collected all of these values and then
                # silently discarded them; report them like the sibling
                # implementations in this file do.
                print('时间:', time, '\n标题:', title, '\n链接:', href,
                      '\n来源:', info, '\n点击次数:', hits_list, '\n')
                print('正文:', text_list)
    
    getpageinfo(li)
    
    # Total page count: the ".a1" element's text contains the total number of
    # articles (e.g. "987条"); 10 articles per list page.  The original
    # rstrip('') was a no-op that left non-digit characters in the string, so
    # pull out the digits explicitly before converting.
    pages = int(re.search(r'\d+', soup.select('.a1')[0].text).group()) // 10 + 1
    
    # Page 1 was already handled above; walk the remaining list pages.
    for i in range(2, pages + 1):
        url_page = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    
        res_page = requests.get(url_page)
        res_page.encoding = 'utf-8'
    
        soup_page = BeautifulSoup(res_page.text, 'html.parser')
        # BUG FIX: was soup.select('li'), which re-processed page 1's items
        # on every iteration instead of the freshly fetched page.
        list_page = soup_page.select('li')
    
        getpageinfo(list_page)
        print(url_page)

    • 完成所有校园新闻的爬取工作。
    import requests
    from bs4 import BeautifulSoup
    import re
    
    # Landing page (page 1) of the campus news channel.
    url_main="http://news.gzcc.cn/html/xiaoyuanxinwen/"
    res = requests.get(url_main)
    # The site serves UTF-8; set it explicitly so res.text decodes correctly.
    res.encoding = 'utf-8'
    
    soup = BeautifulSoup(res.text,'html.parser')
    # All <li> tags on the page; actual news entries are filtered later
    # by the presence of a .news-list-title child.
    li = soup.select('li')
    
    def gethits(url_1):
        """Return the click count (as a string) for one news article.

        *url_1* is the article detail URL ending in ``_<...>/<id>.html``.
        The numeric id is extracted and sent to the OA counter API, which
        replies with JSONP-like JavaScript such as
        ``$('#hits').html('205');`` — the digits inside ``html('...')``
        are the hit count.
        """
        # group(1) is the captured article id (was groups(0)[0]).
        li_id = re.search(r'_.*/(.*).html', url_1).group(1)
        res_hits = requests.get(
            'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(li_id))
        # Strip the surrounding "html('...');" wrapper to leave the number.
        hits = res_hits.text.split('.')[-1].rstrip("');").lstrip("html('")
        return hits
    
    def getpageinfo(label):
        """Print time/title/link/source/hits/body for EVERY news item in *label*.

        *label* is a list of <li> tags from a list page; entries without
        a .news-list-title child (navigation items etc.) are skipped.
        """
        for title_list in label:
            if len(title_list.select('.news-list-title')) > 0:
                href = title_list.select('a')[0]['href']
                title = title_list.select('.news-list-title')[0].text
                time = title_list.select('span')[0].text
                info = title_list.select('span')[1].text

                # Fetch the detail page for the full article body.
                res_list = requests.get(href)
                res_list.encoding = 'utf-8'
                soup_list = BeautifulSoup(res_list.text, 'html.parser')
                text_list = soup_list.select('.show-content')[0].text

                hits_list = gethits(href)

                # '\n' escapes replace the raw line breaks that made the
                # original string literals a syntax error.
                print('时间:', time, '\n标题:', title, '\n链接:', href,
                      '\n来源:', info, '\n点击次数:', hits_list, '\n')
                print('正文:', text_list)
                # break  # uncomment to stop after the first item (debugging)
    
    getpageinfo(li)
    
    # Total page count: the ".a1" element's text contains the total number of
    # articles (e.g. "987条"); 10 articles per list page.  The original
    # rstrip('') was a no-op that left non-digit characters in the string, so
    # pull out the digits explicitly before converting.
    pages = int(re.search(r'\d+', soup.select('.a1')[0].text).group()) // 10 + 1
    
    # Page 1 was already handled above; walk the remaining list pages.
    for i in range(2, pages + 1):
        url_page = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
    
        res_page = requests.get(url_page)
        res_page.encoding = 'utf-8'
    
        soup_page = BeautifulSoup(res_page.text, 'html.parser')
        # BUG FIX: was soup.select('li'), which re-processed page 1's items
        # on every iteration instead of the freshly fetched page.
        list_page = soup_page.select('li')
    
        getpageinfo(list_page)
        # break  # uncomment to stop after one extra page (debugging)

  • 相关阅读:
    springboot 集成jsp
    eclipse 创建 springboot项目
    eclipse 导入别人拷贝过来的工作空间项目
    vue安装及使用
    eclipse配置svn导出项目
    sql为什么用0,1表示男女?在sql语句里转好还是在页面转好?
    svn下载多模块及依赖框架的项目
    django连接sqlserver
    字符编码
    数组希尔排序法
  • 原文地址:https://www.cnblogs.com/zeson/p/7652866.html
Copyright © 2020-2023  润新知