• 爬取搜狐新闻体育类


    #-*-coding:utf-8-*-
    # @Time :2021/4/17 14:58
    # @Author:shuaichao
    # @File :.py
    # @Software: PyCharm
    
    from bs4 import BeautifulSoup        #网页解析,获悉数据.231
    import re                            #正则表达式
    import urllib.request,urllib.error   #制定URL,获取网页数据
    import pymysql
    import traceback
    import time
    import requests
    import json
    #得到制定一个URL的网页内容
    # Fetch the contents of one URL.
    def askUrl(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Returns an empty string when the request fails; the HTTP status
        code and/or failure reason are printed for diagnosis.
        """
        head = {
            # Spoof a desktop browser UA so the server returns the full page.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        # Fixed: the body was wrongly wrapped in `if __name__ == '__main__':`,
        # so the function returned None whenever the module was imported.
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)  # fixed typo: was e.reasen (AttributeError on the error path)
        return html
    #链接数据库
    # Connect to the database.
    def get_conn():
        """Open the local MySQL ``news`` database and return (connection, cursor)."""
        connection = pymysql.connect(
            host="localhost",
            user="root",
            passwd="",
            db="news",
            charset="utf8mb4",
        )
        return connection, connection.cursor()
    #关闭数据库
    # Close the database resources.
    def close_conn(conn, cursor):
        """Release the cursor first, then the connection; either may be None."""
        for resource in (cursor, conn):
            if resource:
                resource.close()
    #更新新闻数据
    # Insert the freshly scraped news rows.
    def update_news(allinfo):
        """Insert scraped articles into table ``new``.

        allinfo: iterable of [title, article, category] rows.
        Best-effort: database errors are printed, never raised.
        """
        cursor = None
        conn = None
        try:
            conn, cursor = get_conn()
            sql = "insert into new(title, article, fenlei) values(%s,%s,%s)"
            print(f"{time.asctime()}开始更新最新数据")
            # executemany: one round trip instead of one execute per row.
            cursor.executemany(sql, allinfo)
            conn.commit()
            print(f"{time.asctime()}更新最新数据完毕")
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
            traceback.print_exc()
        finally:
            close_conn(conn, cursor)
    
    #插入新闻数据
    # def insert_news():
    #     cursor = None
    #     conn = None
    #     try:
    #         dic = getdata()[0]  # 0是历史数据字典,1是最新详细数据列表
    #         print(f"{time.asctime()}开始插入历史数据")
    #         conn, cursor = get_conn()
    #         sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    #         for k, v in dic.items():
    #             cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"),
    #                                  v.get("suspect"), v.get("suspect_add"),
    #                                  v.get("heal"), v.get("heal_add"),
    #                                  v.get("dead"), v.get("dead_add"),
    #                                  v.get("confirm"),
    #                                  ])
    #             conn.commit()
    #             print(f"{time.asctime()}插入历史数据完毕")
    #     except:
    #         traceback.print_exc()
    #     finally:
    #         close_conn(conn, cursor)
    #爬取网页信息
    # Fetch a page and parse it.
    def get_info(baseurl):
        """Download *baseurl* and return the page as a BeautifulSoup tree."""
        return BeautifulSoup(askUrl(baseurl), "html.parser")
    #soup处理并转换成字符串
    # Query the soup by CSS class and also keep a string form for regex work.
    def transport(bs, info):
        """Return (tags, text): every element whose class is *info*, and str(tags)."""
        matches = bs.find_all(class_=info)
        return matches, str(matches)
    if __name__ == "__main__":
        # Scrape the front page's navigation bar to get the category links.
        baseurl = "https://news.sohu.com/"
        html = askUrl(baseurl)
        bs = BeautifulSoup(html, "html.parser")
        ex_info = bs.find_all(class_="head-nav left")
        info = str(ex_info)
        findLink = re.compile(r'<a href="(.*?)">')
        link = findLink.findall(info)
        # Drop the two leading non-category entries, then the stray item
        # that lands at index 9 after that removal.
        del link[:2]
        del link[9]
    #*************************************************************************************
    '''
    ******************
        体育类新闻
    ******************
    '''
    # Sports category page: collect article links from every layout section.
    bs = get_info(link[0])

    # Top section: its anchors carry an explicit class attribute in the markup.
    ex_info, info = transport(bs, "theme__color__hover")
    print("************************************************************")
    findinfo = re.compile(r'<a class="theme__color__hover" href="(.*?)" target="_blank">')
    linkall = re.findall(findinfo, info)
    print("************************************************************")

    # Every remaining section uses the same plain-anchor markup, so the
    # original's six copy-pasted extraction stanzas collapse into one loop.
    findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
    for section_class in ("s-one_center",
                          "z-c-block-list-item",
                          "z-c-block-list-item z-c-block-list-item-first",
                          "z-c-block-list clear",
                          "z-c-block",
                          "z-head-news_item"):
        ex_info, info = transport(bs, section_class)
        linkall += re.findall(findinfo, info)
    # Drop non-article links: short URLs (< 73 chars) are section pages,
    # videos, etc.  The original deleted from the list while iterating it —
    # which skips elements — and had to run the pass three times to
    # compensate; a single comprehension does it correctly.
    linkall = [url for url in linkall if len(url) >= 73]
    # Deduplicate while preserving first-seen order (the original used an
    # O(n^2) nested loop that also deleted during iteration).
    linkall = list(dict.fromkeys(linkall))
    for item in linkall:
        print(item)
    print(len(linkall))
    allTitle = []
    allArticle = []
    allImg = []
    # Crawl each article page ONCE.  The original made two passes (one to
    # discard empty pages, one to scrape), fetching every URL twice; it
    # deleted from linkall while iterating it (skips elements); it shadowed
    # the builtin `str`; and its newline-replace string literal was broken
    # by a raw line break.  All fixed in this single pass.
    valid_links = []
    for index, value in enumerate(linkall):
        bs = get_info(value)
        title = bs.select("h1")
        if not title:
            # No <h1> means the page is not a text article — skip it.
            print(index)
            print(value)
            continue
        valid_links.append(value)
        print(index)
        print(value)
        print(title[0].get_text().strip().replace("原创", ""))
        # Record the cleaned title.
        allTitle.append(title[0].get_text().strip().replace("原创", "").replace("\n", ""))
        # Paragraph 0 is boilerplate; join the rest into one article body
        # (str.join instead of quadratic `+=` concatenation).
        paragraphs = bs.select("article > p")
        body = "".join(p.get_text() for p in paragraphs[1:])
        allArticle.append(body.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
    linkall = valid_links
    # Sanity counts — the three lists must stay the same length.
    print(len(linkall))
    print(len(allTitle))
    print(len(allArticle))
    #插入mysql体育新闻数据
    allinfo = []
    for index, value in enumerate(allTitle):
        allinfo.append([value])
        allinfo[index].append(allArticle[index])
        allinfo[index].append('体育')
    # for item in allinfo:
    #     print(item)
    update_news(allinfo)
  • 相关阅读:
    验证码的编写 asp.net
    甲骨文收购Sun,IT业界进入三国时代
    动态加载css文件导致IE8崩溃的问题
    页面调试中关于Console应该注意的地方
    关于仿网易邮箱5.0的Neter UI框架的开源声明
    仿网易邮箱5.0(二):core.js
    仿网易邮箱5.0(三):panel.js
    仿网易邮箱5.0(一):页面基本样式
    Windows下配置Sass编译环境
    ASP+Access查询时按时间进行查询
  • 原文地址:https://www.cnblogs.com/chaogehahaha/p/14843745.html
Copyright © 2020-2023  润新知