• 蜘蛛页面 获取一个网站的全部url 乐观代码


    蜘蛛页面

    from selenium import webdriver
    import time
    import random
    from bs4 import *
    import pymysql
    
    h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'
    
    
    def mysql_fetch(sql, res_type='tuple'):
        """Run a query on a fresh connection and return all of its rows.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``.

        Args:
            sql: Complete SQL statement to execute.
            res_type: ``'dic'`` yields each row as a dict (``DictCursor``);
                any other value yields rows from the default cursor.

        Returns:
            The result of ``cursor.fetchall()``, or ``()`` when the connection
            could not be opened (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return ()
        cursor_cls = pymysql.cursors.DictCursor if res_type == 'dic' else None
        try:
            cursor = conn.cursor(cursor_cls)
            try:
                cursor.execute(sql)
                # Read the rows while the cursor is still open; the previous
                # code called fetchall() only after closing cursor and connection.
                rows = cursor.fetchall()
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()
        return rows
    
    
    def mysql_write(sql):
        """Execute one data-modifying statement on a fresh connection and commit.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``. Errors raised by ``execute()`` or ``commit()`` still
        propagate to the caller.

        Args:
            sql: Complete SQL statement to execute.

        Returns:
            0 after the commit, 1 when the connection could not be opened
            (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return 1
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Close even when the statement fails so the connection is not leaked.
            conn.close()
        return 0
    
    
    # Seed step: open Chrome on a search-engine page, jump to the start page via
    # JavaScript, and store every qualifying link found on it.
    browser = webdriver.Chrome()
    f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
    # The current second decides which entry page is opened first.
    f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
    browser.get(f_url_l_a)
    time.sleep(random.randint(1, 2))
    url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
    js = 'window.location.href="{}";'.format(url)
    browser.execute_script(js)
    # Raw string: '\m' and '\{' are not valid escape sequences in a normal literal.
    myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(myhtml, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
    with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
        bs = BeautifulSoup(myhtml_o, 'html.parser')
        # href=True skips <a> tags without an href, which used to raise KeyError.
        url_l = [a['href'] for a in bs.find_all('a', href=True)]
    res_l = []
    sql_l = []
    for i in url_l:
        # Keep only absolute links that mention gushiwen.org and are not
        # javascript pseudo-links.
        if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
            continue
        # Skip links already collected from this page and the page itself.
        if i in res_l or i == url:
            continue
        res_l.append(i)
        sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
    if sql_l:
        # Insert first, remove duplicates afterwards, instead of checking the
        # table before every single insert.
        # NOTE(review): values are interpolated unescaped; a title or URL that
        # contains a double quote will break this statement.
        sql = '{}{}'.format(sql, ','.join(sql_l))
        print(sql)
        mysql_write(sql)

        # NOTE(review): this removes the lowest id of each duplicated
        # (page_url, children_url) group, which may be a row already marked
        # if_spider=1 - confirm this is intended.
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  min_id FROM ( SELECT  MIN(id) AS min_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
    
    
    # Crawl loop: visit every stored child URL that is still flagged
    # if_spider=0 and record the links found on that page.
    while True:
        sql_ori = 'SELECT page_url,children_url FROM  parent_url WHERE if_spider=0 ORDER BY  id DESC '
        res = mysql_fetch(sql_ori, 'dic')
        for d in res:
            page_url, children_url = d['page_url'], d['children_url']
            url = children_url
            js = 'window.location.href="{}";'.format(url)
            browser.execute_script(js)
            time.sleep(1)
            browser.refresh()
            # Raw string: '\m' and '\{' are not valid escape sequences.
            myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
            with open(myhtml, 'w', encoding='utf-8') as fw:
                fw.write(browser.page_source)
            sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
            with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
                bs = BeautifulSoup(myhtml_o, 'html.parser')
                # href=True skips anchors without an href; previously one such
                # anchor raised KeyError and the whole page was skipped.
                url_l = [a['href'] for a in bs.find_all('a', href=True)]
            res_l = []
            sql_l = []
            for i in url_l:
                # Relative links such as
                # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
                # are rejected by the startswith('http') test.
                if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
                    continue
                if i in res_l or i == url:
                    continue
                res_l.append(i)
                sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
            if sql_l:
                # Insert first, remove duplicates afterwards.
                sql = '{}{}'.format(sql, ','.join(sql_l))
                print(sql)
                mysql_write(sql)

                sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  min_id FROM ( SELECT  MIN(id) AS min_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
                print(sql_del)
                mysql_write(sql_del)

                # NOTE(review): the row is only marked as crawled when the page
                # yielded at least one new link - confirm this is intended.
                sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
                    page_url, url)
                mysql_write(sql_udp)
                print(sql_udp)
        # Pause between passes over the table.
        time.sleep(3)

    # Never reached: the loop above has no exit.
    dd = 0

    -- One row per (parent page, link found on that page) pair written by the crawler.
    CREATE TABLE `parent_url` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    -- browser title at the time the link was recorded
    `page_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -- URL of the page that was being crawled
    `page_url` text,
    -- absolute link found on that page
    `children_url` text,
    -- 0 = not crawled yet; the crawler sets it to 1 after processing children_url
    `if_spider` tinyint(4) DEFAULT '0',
    PRIMARY KEY (`id`)
    -- NOTE(review): AUTO_INCREMENT=5328 looks like a leftover from a table dump - confirm.
    ) ENGINE=MyISAM AUTO_INCREMENT=5328 DEFAULT CHARSET=latin1;

    先写入,后删除重复记录,

    避免每次写入前都做查重检查,

    从而节省检查所消耗的时间

    获取一个网站的全部url

    修复逻辑错误 

    支持 多进程   脚本多开

    from selenium import webdriver
    import time
    import random
    from bs4 import *
    import pymysql
    
    h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'
    
    
    def mysql_fetch(sql, res_type='tuple'):
        """Run a query on a fresh connection and return all of its rows.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``.

        Args:
            sql: Complete SQL statement to execute.
            res_type: ``'dic'`` yields each row as a dict (``DictCursor``);
                any other value yields rows from the default cursor.

        Returns:
            The result of ``cursor.fetchall()``, or ``()`` when the connection
            could not be opened (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return ()
        cursor_cls = pymysql.cursors.DictCursor if res_type == 'dic' else None
        try:
            cursor = conn.cursor(cursor_cls)
            try:
                cursor.execute(sql)
                # Read the rows while the cursor is still open; the previous
                # code called fetchall() only after closing cursor and connection.
                rows = cursor.fetchall()
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()
        return rows
    
    
    def mysql_write(sql):
        """Execute one data-modifying statement on a fresh connection and commit.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``. Errors raised by ``execute()`` or ``commit()`` still
        propagate to the caller.

        Args:
            sql: Complete SQL statement to execute.

        Returns:
            0 after the commit, 1 when the connection could not be opened
            (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return 1
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Close even when the statement fails so the connection is not leaked.
            conn.close()
        return 0
    
    
    # Seed step: open Chrome on a search-engine page, jump to the start page via
    # JavaScript, and store every qualifying link found on it.
    browser = webdriver.Chrome()
    f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
    # The current second decides which entry page is opened first.
    f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
    browser.get(f_url_l_a)
    time.sleep(random.randint(1, 2))
    url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
    js = 'window.location.href="{}";'.format(url)
    browser.execute_script(js)
    # Raw string: '\m' and '\{' are not valid escape sequences in a normal literal.
    myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(myhtml, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
    with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
        bs = BeautifulSoup(myhtml_o, 'html.parser')
        # href=True skips <a> tags without an href, which used to raise KeyError.
        url_l = [a['href'] for a in bs.find_all('a', href=True)]
    res_l = []
    sql_l = []
    for i in url_l:
        # Keep only absolute links that mention gushiwen.org and are not
        # javascript pseudo-links.
        if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
            continue
        # Skip links already collected from this page and the page itself.
        if i in res_l or i == url:
            continue
        res_l.append(i)
        sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
    if sql_l:
        # Insert first, remove duplicates afterwards, instead of checking the
        # table before every single insert.
        # NOTE(review): values are interpolated unescaped; a title or URL that
        # contains a double quote will break this statement.
        sql = '{}{}'.format(sql, ','.join(sql_l))
        print(sql)
        mysql_write(sql)

        # Removes the highest id of every duplicated (page_url, children_url) group.
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
    
    import random
    
    # https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
    # https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
    # https://so.gushiwen.org/app/
    # Child URLs containing any of these substrings are not visited.
    url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/']
    # Purges rows whose title contains '密码' or whose URLs contain 'PWD'.
    sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM  parent_url  WHERE   INSTR(page_title,'密码')>0 OR  INSTR(UPPER(page_url),'PWD')>0 OR  INSTR(UPPER(children_url),'PWD')>0) AS t);"
    while True:
        mysql_write(sql_filter)

        sql_ori = 'SELECT page_url,children_url FROM  parent_url WHERE if_spider=0'
        res = mysql_fetch(sql_ori, 'dic')
        # Random start offset into the pending rows; presumably so that several
        # copies of this script running in parallel begin at different rows.
        jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
        for d in res:
            jump_c += 1
            if jump_c < jump_s:
                continue
            page_url, children_url = d['page_url'], d['children_url']
            url = children_url

            if any(fl in url for fl in url_kw_filter_l):
                continue

            js = 'window.location.href="{}";'.format(url)
            browser.execute_script(js)
            time.sleep(1)
            browser.refresh()
            # Raw string: '\m' and '\{' are not valid escape sequences.
            myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
            with open(myhtml, 'w', encoding='utf-8') as fw:
                fw.write(browser.page_source)
            sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
            with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
                bs = BeautifulSoup(myhtml_o, 'html.parser')
                # href=True skips anchors without an href; previously one such
                # anchor raised KeyError and the whole page was skipped.
                url_l = [a['href'] for a in bs.find_all('a', href=True)]
            res_l = []
            sql_l = []
            for i in url_l:
                # Relative links such as
                # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
                # are rejected by the startswith('http') test.
                if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
                    continue
                if i in res_l or i == url:
                    continue
                res_l.append(i)
                sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
            if sql_l:
                # Insert first, remove duplicates afterwards.
                sql = '{}{}'.format(sql, ','.join(sql_l))
                print(sql)
                mysql_write(sql)
                sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
                print(sql_del)
                mysql_write(sql_del)
                sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
                    page_url, url)
                mysql_write(sql_udp)
                print(sql_udp)
        # Pause between passes. This call used to sit after the loop, where it
        # was unreachable, so an empty result set made the loop spin.
        time.sleep(3)

    # Never reached: the loop above has no exit.
    dd = 0
    

      

    代码的每一个功能点的模块化 

    from selenium import webdriver
    import time
    import random
    from bs4 import *
    import pymysql
    
    h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'
    
    
    def mysql_fetch(sql, res_type='tuple'):
        """Run a query on a fresh connection and return all of its rows.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``.

        Args:
            sql: Complete SQL statement to execute.
            res_type: ``'dic'`` yields each row as a dict (``DictCursor``);
                any other value yields rows from the default cursor.

        Returns:
            The result of ``cursor.fetchall()``, or ``()`` when the connection
            could not be opened (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return ()
        cursor_cls = pymysql.cursors.DictCursor if res_type == 'dic' else None
        try:
            cursor = conn.cursor(cursor_cls)
            try:
                cursor.execute(sql)
                # Read the rows while the cursor is still open; the previous
                # code called fetchall() only after closing cursor and connection.
                rows = cursor.fetchall()
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()
        return rows
    
    
    def mysql_write(sql):
        """Execute one data-modifying statement on a fresh connection and commit.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``. Errors raised by ``execute()`` or ``commit()`` still
        propagate to the caller.

        Args:
            sql: Complete SQL statement to execute.

        Returns:
            0 after the commit, 1 when the connection could not be opened
            (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return 1
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Close even when the statement fails so the connection is not leaked.
            conn.close()
        return 0
    
    
    # Seed step: open Chrome on a search-engine page, jump to the start page via
    # JavaScript, and store every qualifying link found on it.
    browser = webdriver.Chrome()
    f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
    # The current second decides which entry page is opened first.
    f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
    browser.get(f_url_l_a)
    time.sleep(random.randint(1, 2))
    url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
    js = 'window.location.href="{}";'.format(url)
    browser.execute_script(js)
    # Raw string: '\m' and '\{' are not valid escape sequences in a normal literal.
    myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(myhtml, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
    with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
        bs = BeautifulSoup(myhtml_o, 'html.parser')
        # href=True skips <a> tags without an href, which used to raise KeyError.
        url_l = [a['href'] for a in bs.find_all('a', href=True)]
    res_l = []
    sql_l = []
    for i in url_l:
        # Keep only absolute links that mention gushiwen.org and are not
        # javascript pseudo-links.
        if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
            continue
        # Skip links already collected from this page and the page itself.
        if i in res_l or i == url:
            continue
        res_l.append(i)
        sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
    if sql_l:
        # Insert first, remove duplicates afterwards, instead of checking the
        # table before every single insert.
        # NOTE(review): values are interpolated unescaped; a title or URL that
        # contains a double quote will break this statement.
        sql = '{}{}'.format(sql, ','.join(sql_l))
        print(sql)
        mysql_write(sql)

        # Removes the highest id of every duplicated (page_url, children_url) group.
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
    
    import random
    
    # https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
    # https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
    # https://so.gushiwen.org/app/
    # https://so.gushiwen.org/jiucuo.aspx?u=
    
    # Rows whose page_url or children_url contains any of these keywords
    # (compared in upper case by the SQL below) are purged on every pass.
    url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
    # Template; MYWHERE is replaced by one pair of INSTR tests per keyword.
    sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM  parent_url  WHERE MYWHERE ) AS t);"
    print(sql_filter)
    sql_s_l = [
        "  INSTR(UPPER(page_url),'{}')>0  OR  INSTR(UPPER(children_url),'{}')>0 ".format(kw.upper(), kw.upper())
        for kw in url_kw_filter_l
    ]
    sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

    while True:
        mysql_write(sql_filter)

        sql_ori = 'SELECT page_url,children_url FROM  parent_url WHERE if_spider=0'

        res = mysql_fetch(sql_ori, 'dic')
        # Random start offset into the pending rows; presumably so that several
        # copies of this script running in parallel begin at different rows.
        jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
        for d in res:
            jump_c += 1
            if jump_c < jump_s:
                continue
            page_url, children_url = d['page_url'], d['children_url']
            url = children_url

            js = 'window.location.href="{}";'.format(url)
            browser.execute_script(js)
            time.sleep(1)
            browser.refresh()
            # Raw string: '\m' and '\{' are not valid escape sequences.
            myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
            with open(myhtml, 'w', encoding='utf-8') as fw:
                fw.write(browser.page_source)
            sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
            with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
                bs = BeautifulSoup(myhtml_o, 'html.parser')
                # href=True skips anchors without an href; previously one such
                # anchor raised KeyError and the whole page was skipped.
                url_l = [a['href'] for a in bs.find_all('a', href=True)]
            res_l = []
            sql_l = []
            for i in url_l:
                # Relative links such as
                # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
                # are rejected by the startswith('http') test.
                if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
                    continue
                if i in res_l or i == url:
                    continue
                res_l.append(i)
                sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
            if sql_l:
                # Insert first, remove duplicates afterwards.
                sql = '{}{}'.format(sql, ','.join(sql_l))
                print(sql)
                mysql_write(sql)
                sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
                print(sql_del)
                mysql_write(sql_del)
                sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
                    page_url, url)
                mysql_write(sql_udp)
                print(sql_udp)
        # Pause between passes. This call used to sit after the loop, where it
        # was unreachable, so an empty result set made the loop spin.
        time.sleep(3)

    # Never reached: the loop above has no exit.
    dd = 0
    

      

    from selenium import webdriver
    import time
    import random
    from bs4 import *
    import pymysql
    
    h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'
    
    
    def mysql_fetch(sql, res_type='tuple'):
        """Run a query on a fresh connection and return all of its rows.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``.

        Args:
            sql: Complete SQL statement to execute.
            res_type: ``'dic'`` yields each row as a dict (``DictCursor``);
                any other value yields rows from the default cursor.

        Returns:
            The result of ``cursor.fetchall()``, or ``()`` when the connection
            could not be opened (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return ()
        cursor_cls = pymysql.cursors.DictCursor if res_type == 'dic' else None
        try:
            cursor = conn.cursor(cursor_cls)
            try:
                cursor.execute(sql)
                # Read the rows while the cursor is still open; the previous
                # code called fetchall() only after closing cursor and connection.
                rows = cursor.fetchall()
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()
        return rows
    
    
    def mysql_write(sql):
        """Execute one data-modifying statement on a fresh connection and commit.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``. Errors raised by ``execute()`` or ``commit()`` still
        propagate to the caller.

        Args:
            sql: Complete SQL statement to execute.

        Returns:
            0 after the commit, 1 when the connection could not be opened
            (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return 1
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Close even when the statement fails so the connection is not leaked.
            conn.close()
        return 0
    
    
    # Seed step: open Chrome on a search-engine page, jump to the start page via
    # JavaScript, and store every qualifying link found on it.
    browser = webdriver.Chrome()
    f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
    # The current second decides which entry page is opened first.
    f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
    browser.get(f_url_l_a)
    time.sleep(random.randint(1, 2))
    url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
    js = 'window.location.href="{}";'.format(url)
    browser.execute_script(js)
    # Raw string: '\m' and '\{' are not valid escape sequences in a normal literal.
    myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(myhtml, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
    with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
        bs = BeautifulSoup(myhtml_o, 'html.parser')
        # href=True skips <a> tags without an href, which used to raise KeyError.
        url_l = [a['href'] for a in bs.find_all('a', href=True)]
    res_l = []
    sql_l = []
    for i in url_l:
        # Keep only absolute links that mention gushiwen.org and are not
        # javascript pseudo-links.
        if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
            continue
        # Skip links already collected from this page and the page itself.
        if i in res_l or i == url:
            continue
        res_l.append(i)
        sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
    if sql_l:
        # Insert first, remove duplicates afterwards, instead of checking the
        # table before every single insert.
        # NOTE(review): values are interpolated unescaped; a title or URL that
        # contains a double quote will break this statement.
        sql = '{}{}'.format(sql, ','.join(sql_l))
        print(sql)
        mysql_write(sql)

        # Removes the highest id of every duplicated (page_url, children_url) group.
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
    
    import random
    
    # https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
    # https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
    # https://so.gushiwen.org/app/
    # https://so.gushiwen.org/jiucuo.aspx?u=
    
    # Links containing any of these keywords (case-insensitively) are neither
    # stored by the loop below nor kept in the table by sql_filter.
    url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
    # Template; MYWHERE is replaced by one pair of INSTR tests per keyword.
    sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM  parent_url  WHERE MYWHERE ) AS t);"

    sql_s_l = [
        "  INSTR(UPPER(page_url),'{}')>0  OR  INSTR(UPPER(children_url),'{}')>0 ".format(kw.upper(), kw.upper())
        for kw in url_kw_filter_l
    ]
    sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

    while True:
        mysql_write(sql_filter)
        print(sql_filter)
        # Removes the highest id of every duplicated (page_url, children_url) group.
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)

        # Every URL that already appears as a parent page is excluded from the
        # pending list below.
        sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'
        url_pass = ['"{}"'.format(i[0]) for i in mysql_fetch(sql_pass, res_type='tuple')]
        sql_ori = 'SELECT page_url,children_url FROM  parent_url WHERE if_spider=0'
        if url_pass:
            # Only add the clause when there is something to exclude:
            # "NOT IN ()" is a syntax error in MySQL.
            sql_ori = '{} AND children_url NOT IN ({})'.format(sql_ori, ','.join(url_pass))

        res = mysql_fetch(sql_ori, 'dic')

        # Random start offset into the pending rows; presumably so that several
        # copies of this script running in parallel begin at different rows.
        jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
        for d in res:
            jump_c += 1
            if jump_c < jump_s:
                continue
            page_url, children_url = d['page_url'], d['children_url']
            url = children_url

            js = 'window.location.href="{}";'.format(url)
            browser.execute_script(js)

            # Wait a second, then scroll to the bottom of the page; a failure
            # here is only reported.
            try:
                time.sleep(1)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                browser.execute_script(js)
            except Exception as e:
                print('window.scrollTo-->', e)

            # Raw string: '\m' and '\{' are not valid escape sequences.
            myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
            with open(myhtml, 'w', encoding='utf-8') as fw:
                fw.write(browser.page_source)
            sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
            with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
                bs = BeautifulSoup(myhtml_o, 'html.parser')
                # href=True skips anchors without an href; previously one such
                # anchor raised KeyError and the whole page was skipped.
                url_l = [a['href'] for a in bs.find_all('a', href=True)]
            res_l = []
            sql_l = []
            for i in url_l:
                # Relative links such as
                # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
                # are rejected by the startswith('http') test.
                if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
                    continue
                if i in res_l or i == url:
                    continue
                # Compare upper case against upper case. The previous code tested
                # the mixed-case keyword against i.upper(), which never matched.
                i_upper = i.upper()
                if any(fi.upper() in i_upper for fi in url_kw_filter_l):
                    continue
                res_l.append(i)
                sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
            if sql_l:
                sql = '{}{}'.format(sql, ','.join(sql_l))
                print(sql)
                mysql_write(sql)

                sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
                    page_url, url)
                mysql_write(sql_udp)
                print(sql_udp)

    # Never reached: the loop above has no exit.
    dd = 0
    

      

    from selenium import webdriver
    import time
    import random
    from bs4 import *
    import pymysql
    
    h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'
    
    
    def mysql_fetch(sql, res_type='tuple'):
        """Run a query on a fresh connection and return all of its rows.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``.

        Args:
            sql: Complete SQL statement to execute.
            res_type: ``'dic'`` yields each row as a dict (``DictCursor``);
                any other value yields rows from the default cursor.

        Returns:
            The result of ``cursor.fetchall()``, or ``()`` when the connection
            could not be opened (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return ()
        cursor_cls = pymysql.cursors.DictCursor if res_type == 'dic' else None
        try:
            cursor = conn.cursor(cursor_cls)
            try:
                cursor.execute(sql)
                # Read the rows while the cursor is still open; the previous
                # code called fetchall() only after closing cursor and connection.
                rows = cursor.fetchall()
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()
        return rows
    
    
    def mysql_write(sql):
        """Execute one data-modifying statement on a fresh connection and commit.

        The connection uses the module-level settings ``h``, ``pt``, ``u``,
        ``p`` and ``db``. Errors raised by ``execute()`` or ``commit()`` still
        propagate to the caller.

        Args:
            sql: Complete SQL statement to execute.

        Returns:
            0 after the commit, 1 when the connection could not be opened
            (the error is printed, not raised).
        """
        try:
            conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
        except Exception as e:
            print(e)
            return 1
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                conn.commit()
            finally:
                cursor.close()
        finally:
            # Close even when the statement fails so the connection is not leaked.
            conn.close()
        return 0
    
    
    # Start-up step: open Chrome on a search-engine page and jump to the start
    # page via JavaScript. Link seeding is switched off in this version (see the
    # break below), so the table is left untouched here.
    browser = webdriver.Chrome()
    f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
    # The current second decides which entry page is opened first.
    f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
    browser.get(f_url_l_a)
    time.sleep(random.randint(1, 2))
    url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
    js = 'window.location.href="{}";'.format(url)
    browser.execute_script(js)
    # Raw string: '\m' and '\{' are not valid escape sequences in a normal literal.
    myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(myhtml, 'w', encoding='utf-8') as fw:
        fw.write(browser.page_source)
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
    with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
        bs = BeautifulSoup(myhtml_o, 'html.parser')
        # href=True skips <a> tags without an href, which used to raise KeyError.
        url_l = [a['href'] for a in bs.find_all('a', href=True)]
    res_l = []
    sql_l = []
    for i in url_l:
        # Seeding is disabled: leaving the loop at once keeps sql_l empty, so
        # nothing below is written. Remove this break to seed the table again.
        break
        if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
            continue
        if i in res_l or i == url:
            continue
        res_l.append(i)
        sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
    if sql_l:
        sql = '{}{}'.format(sql, ','.join(sql_l))
        print(sql)
        mysql_write(sql)

        # Removes the highest id of every duplicated (page_url, children_url) group.
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
    
    import random
    
    # https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
    # https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
    # https://so.gushiwen.org/app/
    # https://so.gushiwen.org/jiucuo.aspx?u=
    
    # Links containing any of these keywords (case-insensitively) are neither
    # stored by the loop below nor kept in the table by sql_filter.
    url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
    # Template; MYWHERE is replaced by one pair of INSTR tests per keyword.
    sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM  parent_url  WHERE MYWHERE ) AS t);"

    sql_s_l = [
        "  INSTR(UPPER(page_url),'{}')>0  OR  INSTR(UPPER(children_url),'{}')>0 ".format(kw.upper(), kw.upper())
        for kw in url_kw_filter_l
    ]
    sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

    while True:
        mysql_write(sql_filter)
        print(sql_filter)
        # Removes the highest id of every duplicated (page_url, children_url) group.
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT  max_id FROM ( SELECT  MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab  WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)

        # Every URL that already appears as a parent page is excluded from the
        # pending list below.
        sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'
        url_pass = ['"{}"'.format(i[0]) for i in mysql_fetch(sql_pass, res_type='tuple')]
        sql_ori = 'SELECT page_url,children_url FROM  parent_url WHERE if_spider=0'
        if url_pass:
            # Only add the clause when there is something to exclude:
            # "NOT IN ()" is a syntax error in MySQL.
            sql_ori = '{} AND children_url NOT IN ({})'.format(sql_ori, ','.join(url_pass))

        res = mysql_fetch(sql_ori, 'dic')

        # Random start offset into the pending rows; presumably so that several
        # copies of this script running in parallel begin at different rows.
        jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
        for d in res:
            jump_c += 1
            if jump_c < jump_s:
                continue
            page_url, children_url = d['page_url'], d['children_url']
            url = children_url

            js = 'window.location.href="{}";'.format(url)
            browser.execute_script(js)

            # Wait a second, then scroll to the bottom of the page; a failure
            # here is only reported.
            try:
                time.sleep(1)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                browser.execute_script(js)
            except Exception as e:
                print('window.scrollTo-->', e)

            # Raw string: '\m' and '\{' are not valid escape sequences.
            myhtml = r'D:\myhtml\{}gushiwen.tmp.html'.format(random.randint(123, 999))
            with open(myhtml, 'w', encoding='utf-8') as fw:
                fw.write(browser.page_source)
            sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
            with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
                bs = BeautifulSoup(myhtml_o, 'html.parser')
                # href=True skips anchors without an href; previously one such
                # anchor raised KeyError and the whole page was skipped.
                url_l = [a['href'] for a in bs.find_all('a', href=True)]
            res_l = []
            sql_l = []
            for i in url_l:
                # Relative links such as
                # /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
                # are rejected by the startswith('http') test.
                if 'gushiwen.org' not in i or 'javascript' in i or not i.startswith('http'):
                    continue
                if i in res_l or i == url:
                    continue
                # Compare upper case against upper case. The previous code tested
                # the mixed-case keyword against i.upper(), which never matched.
                i_upper = i.upper()
                if any(fi.upper() in i_upper for fi in url_kw_filter_l):
                    continue
                res_l.append(i)
                sql_l.append('("{}","{}","{}")'.format(browser.title, url, i))
            if sql_l:
                sql = '{}{}'.format(sql, ','.join(sql_l))
                print(sql)
                mysql_write(sql)

                sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
                    page_url, url)
                mysql_write(sql_udp)
                print(sql_udp)

    # Never reached: the loop above has no exit.
    dd = 0
    

      

    (父,子)url有序二元组

  • 相关阅读:
    c语言知识
    数字地与模拟地
    C语言实现顺序表(增删)
    传统数据库、Nosql数据库与云数据库区别?
    大数据处理架构如何
    warning: implicit declaration of function 'func1' [-Wimplicit-function-declaration]
    window10创建virtualenv虚拟环境
    二叉树的实现以及三种遍历方法--代码
    损失函数--KL散度与交叉熵
    市场回测与对冲套利
  • 原文地址:https://www.cnblogs.com/rsapaper/p/8929063.html
Copyright © 2020-2023  润新知