• Python 3.7 crawler: bulk-scraping novels from a novel site and writing them to MySQL (work in progress...). Unresolved issue: after a long run inserting more than a million rows, the table locks up or even crashes (a possible mitigation is sketched after the code).


    A practice project; the code still has plenty of problems and is being improved.

    Running only 6 processes on a bargain-bin Alibaba Cloud T5 server (the ¥99-with-free-shipping kind).

    # encoding: utf-8
    import requests  # HTTP requests
    from lxml import html  # HTML parsing
    import multiprocessing  # process pool (the separate Pool/Semaphore imports were unused)
    import random
    import time
    import os
    import string
    from fake_useragent import UserAgent  # random User-Agent strings
    import base64
    import MySQLdb  # provided by the mysqlclient package
    
    basepath = os.path.abspath('text')  # directory for chapter .txt files
    imgpath = os.path.abspath('timg')   # directory for cover images
    baseUrl = 'http://www.quanshuwang.com/list/1_1.html'
    baseFrom = '全书网'
    book_type = 2  # category id on the site; was named `type`, which shadowed the builtin
    
    def getList(page):  # fetch one page of the category listing, return book detail URLs
        r = requests.get('http://www.quanshuwang.com/all/allvisit_{}_0_0_0_0_0_{}.html'.format(book_type, page), headers=getHeaders()).text
        doc = html.fromstring(r)
        urls = doc.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/@href')
        return urls
    
    
    
    
    def getHeaders():  # build request headers with a random User-Agent
        headers = {
            'Referer': baseUrl,
            'Connection': 'close',
            'User-Agent': UserAgent().random
        }
        return headers
    
    
    def download_img(jpgLink, filename):  # save a cover image to disk (was misleadingly named upload_img)
        with open(filename, "wb+") as jpg:
            jpg.write(requests.get(jpgLink).content)
            print('Image downloaded')
    
    
    def getInfo(url):  # scrape one book's detail page
        try:
            info = {}
            pro = 1
            r = requests.get(url, headers=getHeaders(), timeout=3)
            doc = html.fromstring(r.content)
            des = doc.xpath('//div[@id="waa"]/text()')[0]
            info['des'] = "".join(des.split())
            info['des'] = info['des'][3:]  # drop the leading label characters
            info['name'] = doc.xpath('//div[@class="b-info"]/h1/text()')[0]
            links = doc.xpath('//div[@class="b-oper"]/a/@href')[0]
            imgurl = doc.xpath('//a[@class="l mr11"]/img/@src')[0]
            # name the cover after the base64 of the title so the filename is safe
            img = base64.b64encode(info['name'].encode('utf-8')) + b'.jpg'
            img = (img.decode()).replace('/', '')
            info['thumb'] = 'timg/' + img
            filename = imgpath + '/' + img
            info['from'] = links
            download_img(imgurl, filename)  # fetch the cover image
            getBook(links, pro, info)  # fetch the chapter list and contents
        except requests.exceptions.Timeout:
            print('Connection timed out, retrying...')
            getInfo(url)
        except Exception as e:
            print('Error', e)
            getInfo(url)
    
    
    def insertList(info):  # insert a novel into text_list, or return its existing id
        db = MySQLdb.connect(host='localhost', user='root', passwd='LuoYang%684985', db='python', port=3306,
                             charset='utf8')  # was MySQLdb.connect.connect, which crashes
        you = db.cursor()  # get a cursor
        # parameterized queries avoid the quoting bugs of the old string formatting
        you.execute('select id from text_list where name=%s', (info['name'],))
        is_repeat = you.fetchone()
        if is_repeat:
            print('Novel {} already exists'.format(info['name']))
            db.close()  # the old code leaked the connection on this path
            return is_repeat[0]
        else:
            you.execute("insert into text_list (type,thumb,description,name,author,froms,add_time) values (%s,%s,%s,%s,%s,%s,%s)",
                        (info['type'], info['thumb'], info['des'], info['name'], info['author'], info['from'], int(time.time())))
            data = you.lastrowid  # id of the row just inserted
            db.commit()
            db.close()  # release the connection
            print('Downloading novel {}'.format(info['name']))
            return data
    
    def is_repeat(info, db):  # has this chapter already been stored?
        you = db.cursor()
        you.execute('select id from text_del where l_id=%s and title=%s', (info['l_id'], info['title']))
        if you.fetchone():
            time.sleep(0.1)
            return -1
        else:
            return 1


    def insertContent(info, db):  # insert one chapter row
        you = db.cursor()
        you.execute("insert into text_del (l_id,title,content,add_time,`order`,froms) values (%s,%s,%s,%s,%s,%s)",
                    (info['l_id'], info['title'], info['content'], info['add_time'], info['num'], info['froms']))
        db.commit()
        you.close()  # close the cursor
    
    
    def random_string(size=5, chars=string.ascii_uppercase + string.digits):
        # timestamp plus a short random suffix; used for unique file and directory names
        return str(int(time.time())) + ''.join(random.choice(chars) for _ in range(size))


    def getBook(link, pro, info):  # fetch the chapter list, register the book, store every chapter
        try:
            r = requests.get(link, headers=getHeaders(), timeout=3)
            doc = html.fromstring(r.content)
            info['author'] = doc.xpath('//div[@class="chapName"]/span/text()')[0]
            info['author'] = info['author'][3:]  # drop the leading label characters
            info['type'] = book_type
            res = {}
            res['l_id'] = insertList(info)
            links = doc.xpath('//div[@class="clearfix dirconone"]/li/a')
            folder = random_string()  # was assigned to `str`, shadowing the builtin
            bookpath = os.path.join(basepath, folder)  # portable; the old '%s\%s' only worked on Windows
            if not os.path.exists(bookpath):
                os.mkdir(bookpath)
            num = 0
            db = MySQLdb.connect(host='localhost', user='root', passwd='LuoYang%684985', db='python', port=3306, charset='utf8')
            for i in links:
                num = num + 1
                res['num'] = num
                res['title'] = i.xpath('./text()')[0]
                if is_repeat(res, db) == -1:
                    pass  # chapter already stored, skip it
                else:
                    downTxt(i, folder, pro, res, db)
            db.close()
        except requests.exceptions.Timeout:
            print('Proxy connection timed out, retrying...')
            getBook(link, 0, info)
        except Exception as e:
            # print('Error', e)
            getBook(link, 0, info)
    
    
    def downTxt(page, path, pro, res, db):  # download one chapter and record it
        res['add_time'] = int(time.time())
        url = page.xpath('./@href')[0]
        try:
            r = requests.get(url, headers=getHeaders())
            doc = html.fromstring(r.content)
            arc = "".join(doc.xpath('//div[@id="content"]/text()'))
            chapter = random_string()  # was assigned to `str`, shadowing the builtin
            relname = 'text/{}/{}.txt'.format(path, chapter)
            res['froms'] = url
            res['content'] = relname  # the DB stores the relative file path, not the text itself
            # the old '%s\text\%s\%s.txt' format string contained a literal tab (the \t escape)
            fileName = os.path.join(basepath, path, chapter + '.txt')
            time.sleep(1)
            insertContent(res, db)
            with open(fileName, 'w+', encoding='utf-8') as txt:
                txt.write(arc)
        except requests.exceptions.Timeout:
            # print('Proxy connection timed out, retrying...')
            downTxt(page, path, 0, res, db)
        except Exception as e:
            # print('Error', e, 'retrying...')
            downTxt(page, path, 0, res, db)
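
    # Note: the bare `except Exception` handlers above retry by recursing, which
    # can exhaust the stack on a URL that keeps failing. A bounded retry loop is
    # safer; this is only a sketch, and fetch_with_retry / max_retries are
    # illustrative names, not part of the original code:
    def fetch_with_retry(url, max_retries=3):
        for attempt in range(max_retries):
            try:
                return requests.get(url, headers=getHeaders(), timeout=3)
            except requests.exceptions.RequestException:
                time.sleep(2 ** attempt)  # simple exponential backoff before retrying
        return None  # caller should skip this URL after max_retries failures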
    
    
    def work(i):  # one worker: crawl every book on listing page i
        lists = getList(i)
        for s in lists:
            getInfo(s)
            time.sleep(10)  # throttle between books


    if __name__ == '__main__':
        multiprocessing.freeze_support()
        pool = multiprocessing.Pool(processes=6)
        for i in range(1, 51):  # listing pages 1 through 50
            pool.apply_async(work, (i,))
        pool.close()
        pool.join()
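
    On the unresolved issue from the title: insertContent commits a transaction for every single chapter, so a long run produces millions of tiny transactions against text_del. A common mitigation is to buffer a book's chapters and write them with executemany, committing once per batch. Below is a minimal sketch under that assumption, untested against this exact schema; insertContentBatch and batch_size are illustrative names, not part of the original code. Two related things worth checking: if text_del is a MyISAM table, every write locks the whole table, whereas InnoDB locks rows and copes far better with concurrent writers; and reusing one connection per worker process, instead of opening one per book, would also cut overhead.

    def insertContentBatch(rows, db, batch_size=500):
        # rows: list of (l_id, title, content, add_time, order, froms) tuples
        # collected for one book; one commit per batch instead of one per row.
        you = db.cursor()
        for i in range(0, len(rows), batch_size):
            you.executemany(
                "insert into text_del (l_id,title,content,add_time,`order`,froms) "
                "values (%s,%s,%s,%s,%s,%s)",
                rows[i:i + batch_size])
            db.commit()
        you.close()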
    

      
