• Team Project Sprint, Day 8


    Crawl news articles by tag (class) name
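
    The core idea of the script below: fetch a page, look up elements by their CSS class name with BeautifulSoup's find_all(class_=...), and pull the article links out with a regular expression. A minimal sketch of that approach (same target URL and class name as the full script; error handling omitted):

    import re
    import urllib.request
    from bs4 import BeautifulSoup

    # the site rejects requests without a browser-like User-Agent
    req = urllib.request.Request("https://news.sohu.com/",
                                 headers={"User-Agent": "Mozilla/5.0"})
    html = urllib.request.urlopen(req).read().decode("utf-8")
    bs = BeautifulSoup(html, "html.parser")
    # look up the navigation bar by its class name, then pull out the hrefs
    nav = str(bs.find_all(class_="head-nav left"))
    links = re.findall(r'<a href="(.*?)">', nav)
    print(links)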

    from bs4 import BeautifulSoup        # HTML parsing and data extraction
    import re                            # regular expressions
    import urllib.request, urllib.error  # build request URLs and fetch page content
    import pymysql
    import traceback
    import time
    import requests
    import json
    # Fetch the HTML content of the specified URL
    def askUrl(url):
        head={
            # "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
            # "Connection": "keep-alive",
            # "Cache-Control": "max-age = 0",
            # "Accept-Language": "zh - CN, zh;q = 0.9",
            # "Accept-Encoding": "gzip, deflate, br",
            # "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html
    # Connect to the MySQL database
    def get_conn():
        conn = pymysql.connect(
            host="localhost",
            user="root",
            passwd="qwer1234",
            db="news",
            charset="utf8mb4"
        )
        cursor = conn.cursor()
        return conn, cursor
    # Close the database connection
    def close_conn(conn, cursor):
        if cursor:
            cursor.close()
        if conn:
            conn.close()
    # Insert news rows into the database
    def update_news(allinfo):
        cursor = None
        conn = None
        try:
            conn, cursor = get_conn()
            sql = "insert into new(title, article, fenlei) values(%s,%s,%s)"
            print(f"{time.asctime()} start inserting latest data")
            for item in allinfo:
                cursor.execute(sql, item)
            conn.commit()
            print(f"{time.asctime()} finished inserting latest data")
        except:
            traceback.print_exc()
        finally:
            close_conn(conn, cursor)
    
    # Insert historical data (commented out)
    # def insert_news():
    #     cursor = None
    #     conn = None
    #     try:
    #         dic = getdata()[0]  # index 0 is the history dict, index 1 is the latest detail list
    #         print(f"{time.asctime()} start inserting history data")
    #         conn, cursor = get_conn()
    #         sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    #         for k, v in dic.items():
    #             cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"),
    #                                  v.get("suspect"), v.get("suspect_add"),
    #                                  v.get("heal"), v.get("heal_add"),
    #                                  v.get("dead"), v.get("dead_add"),
    #                                  v.get("confirm"),
    #                                  ])
    #             conn.commit()
    #             print(f"{time.asctime()} finished inserting history data")
    #     except:
    #         traceback.print_exc()
    #     finally:
    #         close_conn(conn, cursor)
    # Fetch a page and return its parsed soup
    def get_info(baseurl):
        html = askUrl(baseurl)
        bs = BeautifulSoup(html, "html.parser")
        return bs
    # Find elements by class name and also return them as a string
    def transport(bs, info):
        ex_info = bs.find_all(class_=info)
        info = str(ex_info)
        return ex_info, info
    if __name__=="__main__":
        baseurl = "https://news.sohu.com/"
        html = askUrl(baseurl)
        bs = BeautifulSoup(html, "html.parser")
        ex_info = bs.find_all(class_="head-nav left")
        info = str(ex_info)
        findLink = re.compile(r'<a href="(.*?)">')
        link = re.findall(findLink, info)
        del link[0]
        del link[0]
        del link[9]
        # total number of news categories
        # for item in link:
        #     print(item)
    #*************************************************************************************
    '''
    ******************
        Sports news
    ******************
    '''
    # Block 1, upper part
    bs = get_info(link[0])
    ex_info, info = transport(bs, "theme__color__hover")
    print("************************************************************")
    findinfo = re.compile(r'<a class="theme__color__hover" href="(.*?)" target="_blank">')
    link0 = re.findall(findinfo, info)
    # print(len(link0))
    # for item in link0:
    #     print(item)
    print("************************************************************")
    # Block 1, lower part
    ex_info, info = transport(bs, "s-one_center")
    findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
    link1 = re.findall(findinfo, info)
    # print(len(link1))
    # for item in link1:
    #     print(item)
    # print("************************************************************")
    ex_info, info = transport(bs, "z-c-block-list-item")
    findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
    link2 = re.findall(findinfo, info)
    # print(len(link2))
    # for item in link2:
    #     print(item)
    # print("************************************************************")
    ex_info, info = transport(bs, "z-c-block-list-item z-c-block-list-item-first")
    findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
    link3 = re.findall(findinfo, info)
    # print(len(link3))
    # for item in link3:
    #     print(item)
    # print("************************************************************")
    ex_info, info = transport(bs, "z-c-block-list clear")
    findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
    link4 = re.findall(findinfo, info)
    # print(len(link4))
    # for item in link4:
    #     print(item)
    # print("************************************************************")
    ex_info, info = transport(bs, "z-c-block")
    findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
    link5 = re.findall(findinfo, info)
    # print(len(link5))
    # for item in link5:
    #     print(item)
    # print("************************************************************")
    ex_info, info = transport(bs, "z-head-news_item")
    findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
    link6 = re.findall(findinfo, info)
    # print(len(link5))
    # for item in link5:
    #     print(item)
    # print("************************************************************")
    # Combine all collected links
    linkall = link0+link1+link2+link3+link4+link5+link6
    # Drop non-article links (URLs shorter than 73 characters are not text news pages)
    linkall = [value for value in linkall if len(value) >= 73]
    # print(len(linkall))
    # test
    # for item in linkall:
    #     print(item)
    # Remove duplicate links while preserving order
    linkall = list(dict.fromkeys(linkall))
    for item in linkall:
        print(item)
    print(len(linkall))
    # print("************************************************************")
    allTitle = []
    allArticle = []
    allImg = []
    # Drop pages that have no <h1> element (empty pages)
    valid_links = []
    for index, value in enumerate(linkall):
        bs = get_info(value)
        title = bs.select("h1")
        if title:
            valid_links.append(value)
        else:
            print(index)
            print(value)
    linkall = valid_links
    # Scrape the title and article body from each page
    for index, value in enumerate(linkall):
        bs = get_info(value)
        title = bs.select("h1")
        if title:
            text = ''
            # add the title to the title list
            allTitle.append(title[0].get_text().strip().replace("原创", "").replace("\n", ""))
            print(index)
            print(value)
            print(title[0].get_text().strip().replace("原创", ""))
            # add the article body to the article list
            article = bs.select("article > p")
            for item in range(1, len(article)):
                text += article[item].get_text()
            # article = article[0].get_text().replace("返回搜狐,查看更多", "").replace("责任编辑:", "").replace("\n", "")
            allArticle.append(text.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
            # add image links to the image list
            # ex_info, info = transport(bs, "ql-align-center")
            # findImg = re.compile(r'<p class="ql-align-center"><img max-width="600" src="(.*?)"/></p>')
            # Img = re.findall(findImg, info)
            # if Img:
            #     allImg.append(Img)
            # else:
            #     allImg.append("")
        else:
            print(index)
            print(value)
    # test
    print(len(linkall))
    print(len(allTitle))
    print(len(allArticle))
    # print(len(allImg))
    # Assemble rows and insert sports news into MySQL
    allinfo = []
    for index, value in enumerate(allTitle):
        allinfo.append([value])
        allinfo[index].append(allArticle[index])
        allinfo[index].append('体育')  # category label: sports
    # for item in allinfo:
    #     print(item)
    update_news(allinfo)
    '''

    Scrape travel news

    '''
    head = {
            # "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
            # "Connection": "keep-alive",
            # "Cache-Control": "max-age = 0",
            # "Accept-Language": "zh - CN, zh;q = 0.9",
            # "Accept-Encoding": "gzip, deflate, br",
            # "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
    # all news article URLs
    linkall = []
    # feed (JSONP) URLs that return the news lists
    linkJQ = []
    # article ids
    Linkid = []
    # article author ids
    LinkAid = []
    # all titles
    allTitle = []
    # all article bodies
    allArticle = []
    # all image links
    allImg = []
    # rows to be inserted into MySQL
    allinfo = []
    # Build the JSONP feed URL for each page and fetch it
    for i in range(1,10):
        linkJQ.append('https://cis.sohu.com/cis/feeds?callback=jQuery112404940224114573859_1619226100800&clientType=3&suv=2011032041009993&pvId=1619226100991dZepSty&sceneParam=%5B%7B%22page%22%3A'+str(i)+'%2C%22size%22%3A24%2C%22spm%22%3A%22smpc.travel-home.feed%22%7D%5D&refererSpm=smpc.travel-home.feed&refererPath=%2F')
        res = requests.get(linkJQ[i-1], headers=head)
        # strip the jQuery JSONP callback wrapper (and the trailing ')') to get plain JSON
        response_data = json.loads(res.text.replace('jQuery112404940224114573859_1619226100800(', '')[:-1])
        # store each article's id and authorId
        for index, value in enumerate(response_data['smpc.travel-home.feed']['data']):
            if int(response_data['smpc.travel-home.feed']['data'][index]['resourceData']['id']) > 1000000:
                Linkid.append(response_data['smpc.travel-home.feed']['data'][index]['resourceData']['id'])
                LinkAid.append(str(response_data['smpc.travel-home.feed']['data'][index]['resourceData']['contentData']['authorId']))
    
    # Build the full URL of every travel article
    for index, value in enumerate(Linkid):
        linkall.append('https://www.sohu.com/a/'+str(Linkid[index])+'_'+str(LinkAid[index])+'?scm=1004.768163804164063232.0.0.4162&spm=smpc.travel-home.feed.5.1619267001122I92VC4c')
    # the last link is an advertisement, drop it
    linkall.pop()
    # Scrape title and body from each travel article
    for index, value in enumerate(linkall):
        bs = get_info(value)
        title = bs.select("h1")
        article = bs.select("article > p")
        if title and article:
            text = ''
            # concatenate the article paragraphs
            for item in range(1, len(article)):
                text += article[item].get_text()
            if len(text) * 4 > 16000:
                print("exceeds the storable length, skipping")
                continue
            # article = article[0].get_text().replace("返回搜狐,查看更多", "").replace("责任编辑:", "").replace("\n", "")
            allArticle.append(text.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
            # add the title to the title list
            allTitle.append(title[0].get_text().strip().replace("原创", "").replace("\n", ""))
            print(index)
            print(value)
            print(title[0].get_text().strip().replace("原创", ""))
            # add image links to the image list
            # ex_info, info = transport(bs, "ql-align-center")
            # findImg = re.compile(r'<p class="ql-align-center"><img max-width="600" src="(.*?)"/></p>')
            # Img = re.findall(findImg, info)
            # if Img:
            #     allImg.append(Img)
            # else:
            #     allImg.append("")
        else:
            print(index)
            print(value)
    # for item in linkall:
    #     allinfo.append([item])
    for index, value in enumerate(allTitle):
        allinfo.append([value])
        allinfo[index].append(allArticle[index])
        allinfo[index].append('旅游')  # category label: travel
    for item in allinfo:
        print(item)
    update_news(allinfo)
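
    For reference, update_news inserts into a table new(title, article, fenlei) in a database named news. The post never shows the table definition, so the sketch below is only an assumption that satisfies that INSERT (column names come from the code, column types are guessed):

    import pymysql

    # one-off setup sketch; column types are assumptions, not taken from the post
    conn = pymysql.connect(host="localhost", user="root", passwd="qwer1234",
                           db="news", charset="utf8mb4")
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS new (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255),
                article TEXT,
                fenlei VARCHAR(32)
            ) CHARACTER SET utf8mb4
        """)
    conn.commit()
    conn.close()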
  • Original post: https://www.cnblogs.com/wang2232985989/p/14908659.html