蜘蛛页面
# Crawler, revision 1: seeds the parent_url table with the links found on one
# gushiwen.org page, then keeps visiting every stored child URL that is not yet
# flagged if_spider=1 and stores the links found there.
#
# Strategy: rows are inserted without a per-row existence check and duplicates
# are removed afterwards with one DELETE, to avoid the cost of a SELECT before
# every insert.
import random
import time

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'

INSERT_SQL_HEAD = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
# Deletes the newest row (MAX(id)) of each duplicated (page_url, children_url)
# group, so the older row -- which may already carry if_spider=1 -- survives.
# Deleting MIN(id) would drop the flagged row and cause re-crawling.
DEDUPE_SQL = ('DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( '
              'SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url '
              'GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )')
MARK_DONE_SQL = 'UPDATE parent_url SET if_spider=1 WHERE page_url=%s AND children_url=%s'
PENDING_SQL = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 ORDER BY id DESC '
SNAPSHOT_PATH = r'D:\myhtml\{}gushiwen.tmp.html'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return every row.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param res_type: ``'dic'`` returns rows as dicts, anything else as tuples.
    :param args: optional parameters bound by the driver (None = send *sql* as is).
    :return: the fetched rows, or ``()`` when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Fetch while the cursor is still open instead of after close().
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        # Always release the connection, even when execute() raises.
        conn.close()


def mysql_write(sql, args=None):
    """Run a data-modifying statement and commit it.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param args: optional parameters bound by the driver.
    :return: 0 on success, 1 when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
        return 0
    finally:
        conn.close()


def navigate(target):
    """Navigate via JavaScript; the URL is passed as an argument, not spliced into the script."""
    browser.execute_script('window.location.href = arguments[0];', target)


def snapshot_page():
    """Dump the current page source to a scratch file and return the HTML.

    The HTML is returned from memory rather than re-read from disk: the file
    name has only a few hundred possible values, so parallel runs could
    otherwise read each other's dump.
    """
    html = browser.page_source
    path = SNAPSHOT_PATH.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return html


def extract_child_links(html, parent_url):
    """Return the distinct absolute gushiwen.org links in *html*, excluding *parent_url*."""
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors without an href instead of raising KeyError.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http') or 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == parent_url or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def store_child_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link, then remove duplicate rows."""
    if not links:
        return
    placeholders = ','.join(['(%s,%s,%s)'] * len(links))
    args = [value for link in links for value in (title, parent_url, link)]
    print(INSERT_SQL_HEAD, len(links), 'rows for', parent_url)
    mysql_write(INSERT_SQL_HEAD + placeholders, args)
    print(DEDUPE_SQL)
    mysql_write(DEDUPE_SQL)


browser = webdriver.Chrome()
# Open a search engine first, picked by the current second, before going to the target site.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed step: store the links of the start page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
navigate(url)
store_child_links(browser.title, url, extract_child_links(snapshot_page(), url))

while True:
    res = mysql_fetch(PENDING_SQL, 'dic')
    if not res:
        # Nothing pending: wait instead of reconnecting to the DB in a tight loop.
        time.sleep(3)
        continue
    for d in res:
        page_url, url = d['page_url'], d['children_url']
        navigate(url)
        time.sleep(1)
        browser.refresh()
        store_child_links(browser.title, url, extract_child_links(snapshot_page(), url))
        mysql_write(MARK_DONE_SQL, (page_url, url))
        print(MARK_DONE_SQL, (page_url, url))
        time.sleep(3)
-- Link table used by the crawler scripts in this file: one row per
-- (parent page, child link) pair found while crawling.
CREATE TABLE `parent_url` (
-- Surrogate key; the scripts' duplicate-cleanup DELETE picks rows by MIN(id)/MAX(id).
`id` int(11) NOT NULL AUTO_INCREMENT,
-- Title of the parent page (the scripts store browser.title here); utf8mb4 so non-Latin titles fit.
`page_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
-- URL of the page the link was found on.
-- NOTE(review): no column charset is given, so this and children_url use the table
-- default (latin1) -- confirm that URLs never contain non-Latin characters.
`page_url` text,
-- URL of the link found on page_url; the scripts later visit it.
`children_url` text,
-- 0 = not yet visited; the scripts set it to 1 after visiting children_url.
`if_spider` tinyint(4) DEFAULT '0',
PRIMARY KEY (`id`)
-- NOTE(review): AUTO_INCREMENT=5328 presumably comes from a dump of a populated table -- confirm before reuse.
) ENGINE=MyISAM AUTO_INCREMENT=5328 DEFAULT CHARSET=latin1;
先写入,后删除
避免每个写入前的检查
消耗时间
获取一个网站的全部url
修复逻辑错误:去重时改为删除每组重复行中 id 最大的一行(MAX(id)),不再删除 id 最小的一行(MIN(id))
支持 多进程 脚本多开
# Crawler, revision 2: same flow as revision 1, plus
#   * a cleanup DELETE for password-related rows on every pass,
#   * a keyword list of URLs that are never visited,
#   * a random start offset into the pending rows, so that several copies of
#     the script started in parallel tend to work on different rows.
import random
import time

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'

INSERT_SQL_HEAD = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
# Deletes the newest row of each duplicated (page_url, children_url) group.
DEDUPE_SQL = ('DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( '
              'SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url '
              'GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )')
MARK_DONE_SQL = 'UPDATE parent_url SET if_spider=1 WHERE page_url=%s AND children_url=%s'
PENDING_SQL = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
SNAPSHOT_PATH = r'D:\myhtml\{}gushiwen.tmp.html'

# Examples of URLs that must not be visited:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/']
# Removes rows whose title mentions "password" or whose URLs contain PWD.
sql_filter = ("DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url "
              "WHERE INSTR(page_title,'密码')>0 OR INSTR(UPPER(page_url),'PWD')>0 "
              "OR INSTR(UPPER(children_url),'PWD')>0) AS t);")


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return every row.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param res_type: ``'dic'`` returns rows as dicts, anything else as tuples.
    :param args: optional parameters bound by the driver (None = send *sql* as is).
    :return: the fetched rows, or ``()`` when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Fetch while the cursor is still open instead of after close().
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        # Always release the connection, even when execute() raises.
        conn.close()


def mysql_write(sql, args=None):
    """Run a data-modifying statement and commit it.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param args: optional parameters bound by the driver.
    :return: 0 on success, 1 when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
        return 0
    finally:
        conn.close()


def navigate(target):
    """Navigate via JavaScript; the URL is passed as an argument, not spliced into the script."""
    browser.execute_script('window.location.href = arguments[0];', target)


def snapshot_page():
    """Dump the current page source to a scratch file and return the HTML.

    The HTML is returned from memory rather than re-read from disk, because
    parallel runs can pick the same scratch file name.
    """
    html = browser.page_source
    path = SNAPSHOT_PATH.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return html


def extract_child_links(html, parent_url):
    """Return the distinct absolute gushiwen.org links in *html*, excluding *parent_url*."""
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors without an href instead of raising KeyError.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http') or 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == parent_url or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def store_child_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link, then remove duplicate rows."""
    if not links:
        return
    placeholders = ','.join(['(%s,%s,%s)'] * len(links))
    args = [value for link in links for value in (title, parent_url, link)]
    print(INSERT_SQL_HEAD, len(links), 'rows for', parent_url)
    mysql_write(INSERT_SQL_HEAD + placeholders, args)
    print(DEDUPE_SQL)
    mysql_write(DEDUPE_SQL)


browser = webdriver.Chrome()
# Open a search engine first, picked by the current second, before going to the target site.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed step: store the links of the start page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
navigate(url)
store_child_links(browser.title, url, extract_child_links(snapshot_page(), url))

while True:
    mysql_write(sql_filter)
    res = mysql_fetch(PENDING_SQL, 'dic')
    if not res:
        # Nothing pending: wait instead of reconnecting to the DB in a tight loop.
        time.sleep(3)
        continue
    # Skip a random number of leading rows, always leaving about the last ten.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        # Filtered URLs are skipped without being flagged as visited.
        if any(kw in url for kw in url_kw_filter_l):
            continue
        navigate(url)
        time.sleep(1)
        browser.refresh()
        store_child_links(browser.title, url, extract_child_links(snapshot_page(), url))
        mysql_write(MARK_DONE_SQL, (page_url, url))
        print(MARK_DONE_SQL, (page_url, url))
        time.sleep(3)
代码的每一个功能点的模块化
# Crawler, revision 3: same flow as revision 2, but unwanted URLs are no longer
# skipped in the Python loop; instead the cleanup DELETE is generated from the
# keyword list and removes matching rows from the table on every pass.
import random
import time

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'

INSERT_SQL_HEAD = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
# Deletes the newest row of each duplicated (page_url, children_url) group.
DEDUPE_SQL = ('DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( '
              'SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url '
              'GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )')
MARK_DONE_SQL = 'UPDATE parent_url SET if_spider=1 WHERE page_url=%s AND children_url=%s'
PENDING_SQL = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
SNAPSHOT_PATH = r'D:\myhtml\{}gushiwen.tmp.html'

# Examples of URLs that must not be kept:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']

# One case-insensitive INSTR test per keyword and URL column, OR-ed together.
sql_s_l = [
    " INSTR(UPPER(page_url),'{0}')>0 OR INSTR(UPPER(children_url),'{0}')>0 ".format(kw.upper())
    for kw in url_kw_filter_l
]
sql_filter = ("DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url "
              "WHERE MYWHERE ) AS t);").replace('MYWHERE', ' OR '.join(sql_s_l))
print(sql_filter)


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return every row.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param res_type: ``'dic'`` returns rows as dicts, anything else as tuples.
    :param args: optional parameters bound by the driver (None = send *sql* as is).
    :return: the fetched rows, or ``()`` when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Fetch while the cursor is still open instead of after close().
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        # Always release the connection, even when execute() raises.
        conn.close()


def mysql_write(sql, args=None):
    """Run a data-modifying statement and commit it.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param args: optional parameters bound by the driver.
    :return: 0 on success, 1 when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
        return 0
    finally:
        conn.close()


def navigate(target):
    """Navigate via JavaScript; the URL is passed as an argument, not spliced into the script."""
    browser.execute_script('window.location.href = arguments[0];', target)


def snapshot_page():
    """Dump the current page source to a scratch file and return the HTML.

    The HTML is returned from memory rather than re-read from disk, because
    parallel runs can pick the same scratch file name.
    """
    html = browser.page_source
    path = SNAPSHOT_PATH.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return html


def extract_child_links(html, parent_url):
    """Return the distinct absolute gushiwen.org links in *html*, excluding *parent_url*."""
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors without an href instead of raising KeyError.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http') or 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == parent_url or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def store_child_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link, then remove duplicate rows."""
    if not links:
        return
    placeholders = ','.join(['(%s,%s,%s)'] * len(links))
    args = [value for link in links for value in (title, parent_url, link)]
    print(INSERT_SQL_HEAD, len(links), 'rows for', parent_url)
    mysql_write(INSERT_SQL_HEAD + placeholders, args)
    print(DEDUPE_SQL)
    mysql_write(DEDUPE_SQL)


browser = webdriver.Chrome()
# Open a search engine first, picked by the current second, before going to the target site.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed step: store the links of the start page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
navigate(url)
store_child_links(browser.title, url, extract_child_links(snapshot_page(), url))

while True:
    mysql_write(sql_filter)
    res = mysql_fetch(PENDING_SQL, 'dic')
    if not res:
        # Nothing pending: wait instead of reconnecting to the DB in a tight loop.
        time.sleep(3)
        continue
    # Skip a random number of leading rows, always leaving about the last ten.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        navigate(url)
        time.sleep(1)
        browser.refresh()
        store_child_links(browser.title, url, extract_child_links(snapshot_page(), url))
        mysql_write(MARK_DONE_SQL, (page_url, url))
        print(MARK_DONE_SQL, (page_url, url))
        time.sleep(3)
# Crawler, revision 4: compared with revision 3
#   * keyword cleanup and duplicate cleanup run once per pass, not per page,
#   * child URLs that already appear as a page_url are not visited again,
#   * each page is scrolled to the bottom once before its source is read,
#   * links matching the keyword list are dropped before they are inserted.
import random
import time

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'

INSERT_SQL_HEAD = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
# Deletes the newest row of each duplicated (page_url, children_url) group.
DEDUPE_SQL = ('DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( '
              'SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url '
              'GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )')
MARK_DONE_SQL = 'UPDATE parent_url SET if_spider=1 WHERE page_url=%s AND children_url=%s'
PENDING_SQL = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
SNAPSHOT_PATH = r'D:\myhtml\{}gushiwen.tmp.html'

# Examples of URLs that must not be kept:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']

# One case-insensitive INSTR test per keyword and URL column, OR-ed together.
sql_s_l = [
    " INSTR(UPPER(page_url),'{0}')>0 OR INSTR(UPPER(children_url),'{0}')>0 ".format(kw.upper())
    for kw in url_kw_filter_l
]
sql_filter = ("DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url "
              "WHERE MYWHERE ) AS t);").replace('MYWHERE', ' OR '.join(sql_s_l))


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return every row.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param res_type: ``'dic'`` returns rows as dicts, anything else as tuples.
    :param args: optional parameters bound by the driver (None = send *sql* as is).
    :return: the fetched rows, or ``()`` when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Fetch while the cursor is still open instead of after close().
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        # Always release the connection, even when execute() raises.
        conn.close()


def mysql_write(sql, args=None):
    """Run a data-modifying statement and commit it.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param args: optional parameters bound by the driver.
    :return: 0 on success, 1 when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
        return 0
    finally:
        conn.close()


def navigate(target):
    """Navigate via JavaScript; the URL is passed as an argument, not spliced into the script."""
    browser.execute_script('window.location.href = arguments[0];', target)


def snapshot_page():
    """Dump the current page source to a scratch file and return the HTML.

    The HTML is returned from memory rather than re-read from disk, because
    parallel runs can pick the same scratch file name.
    """
    html = browser.page_source
    path = SNAPSHOT_PATH.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return html


def extract_child_links(html, parent_url, blocked_keywords=()):
    """Return the distinct absolute gushiwen.org links in *html*.

    *parent_url* itself and any link containing one of *blocked_keywords*
    (compared case-insensitively) are left out.
    """
    blocked = [kw.upper() for kw in blocked_keywords]
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors without an href instead of raising KeyError.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http') or 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == parent_url or href in seen:
            continue
        # Both sides are upper-cased; comparing the raw keyword against the
        # upper-cased link would never match mixed- or lower-case keywords.
        upper_href = href.upper()
        if any(kw in upper_href for kw in blocked):
            continue
        seen.add(href)
        links.append(href)
    return links


def store_child_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link; return True when rows were written."""
    if not links:
        return False
    placeholders = ','.join(['(%s,%s,%s)'] * len(links))
    args = [value for link in links for value in (title, parent_url, link)]
    print(INSERT_SQL_HEAD, len(links), 'rows for', parent_url)
    mysql_write(INSERT_SQL_HEAD + placeholders, args)
    return True


def fetch_pending():
    """Return unvisited rows whose child URL is not already stored as a parent page."""
    known_pages = [row[0] for row in mysql_fetch('SELECT DISTINCT(page_url) FROM parent_url',
                                                 res_type='tuple')
                   if row[0] is not None]  # a NULL inside NOT IN (...) would match no row at all
    if not known_pages:
        # "NOT IN ()" is a syntax error, so drop the clause when there is nothing to exclude.
        return mysql_fetch(PENDING_SQL, 'dic')
    placeholders = ','.join(['%s'] * len(known_pages))
    sql_ori = '{} AND children_url NOT IN ({})'.format(PENDING_SQL, placeholders)
    return mysql_fetch(sql_ori, 'dic', known_pages)


browser = webdriver.Chrome()
# Open a search engine first, picked by the current second, before going to the target site.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed step: store the links of the start page (no keyword filtering here).
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
navigate(url)
if store_child_links(browser.title, url, extract_child_links(snapshot_page(), url)):
    print(DEDUPE_SQL)
    mysql_write(DEDUPE_SQL)

while True:
    mysql_write(sql_filter)
    print(sql_filter)
    print(DEDUPE_SQL)
    mysql_write(DEDUPE_SQL)
    res = fetch_pending()
    if not res:
        # Nothing pending: wait instead of reconnecting to the DB in a tight loop.
        time.sleep(3)
        continue
    # Skip a random number of leading rows, always leaving about the last ten.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        navigate(url)
        try:
            # Scroll to the bottom once after a short wait before reading the source.
            time.sleep(1)
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        except Exception as e:
            print('window.scrollTo-->', e)
        store_child_links(browser.title, url,
                          extract_child_links(snapshot_page(), url, url_kw_filter_l))
        mysql_write(MARK_DONE_SQL, (page_url, url))
        print(MARK_DONE_SQL, (page_url, url))
# Crawler, revision 5: same as revision 4, except that the seed page is still
# opened and dumped but its links are NOT inserted (the original loop started
# with an unconditional `break`). That is now an explicit switch.
import random
import time

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'

# Set to True to insert the seed page's links again (off, as in the original).
INSERT_SEED_LINKS = False

INSERT_SQL_HEAD = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
# Deletes the newest row of each duplicated (page_url, children_url) group.
DEDUPE_SQL = ('DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( '
              'SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url '
              'GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )')
MARK_DONE_SQL = 'UPDATE parent_url SET if_spider=1 WHERE page_url=%s AND children_url=%s'
PENDING_SQL = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
SNAPSHOT_PATH = r'D:\myhtml\{}gushiwen.tmp.html'

# Examples of URLs that must not be kept:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']

# One case-insensitive INSTR test per keyword and URL column, OR-ed together.
sql_s_l = [
    " INSTR(UPPER(page_url),'{0}')>0 OR INSTR(UPPER(children_url),'{0}')>0 ".format(kw.upper())
    for kw in url_kw_filter_l
]
sql_filter = ("DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url "
              "WHERE MYWHERE ) AS t);").replace('MYWHERE', ' OR '.join(sql_s_l))


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return every row.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param res_type: ``'dic'`` returns rows as dicts, anything else as tuples.
    :param args: optional parameters bound by the driver (None = send *sql* as is).
    :return: the fetched rows, or ``()`` when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Fetch while the cursor is still open instead of after close().
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        # Always release the connection, even when execute() raises.
        conn.close()


def mysql_write(sql, args=None):
    """Run a data-modifying statement and commit it.

    :param sql: statement text; may contain ``%s`` placeholders when *args* is given.
    :param args: optional parameters bound by the driver.
    :return: 0 on success, 1 when the connection cannot be opened.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
        return 0
    finally:
        conn.close()


def navigate(target):
    """Navigate via JavaScript; the URL is passed as an argument, not spliced into the script."""
    browser.execute_script('window.location.href = arguments[0];', target)


def snapshot_page():
    """Dump the current page source to a scratch file and return the HTML.

    The HTML is returned from memory rather than re-read from disk, because
    parallel runs can pick the same scratch file name.
    """
    html = browser.page_source
    path = SNAPSHOT_PATH.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return html


def extract_child_links(html, parent_url, blocked_keywords=()):
    """Return the distinct absolute gushiwen.org links in *html*.

    *parent_url* itself and any link containing one of *blocked_keywords*
    (compared case-insensitively) are left out.
    """
    blocked = [kw.upper() for kw in blocked_keywords]
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors without an href instead of raising KeyError.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http') or 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == parent_url or href in seen:
            continue
        # Both sides are upper-cased; comparing the raw keyword against the
        # upper-cased link would never match mixed- or lower-case keywords.
        upper_href = href.upper()
        if any(kw in upper_href for kw in blocked):
            continue
        seen.add(href)
        links.append(href)
    return links


def store_child_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link; return True when rows were written."""
    if not links:
        return False
    placeholders = ','.join(['(%s,%s,%s)'] * len(links))
    args = [value for link in links for value in (title, parent_url, link)]
    print(INSERT_SQL_HEAD, len(links), 'rows for', parent_url)
    mysql_write(INSERT_SQL_HEAD + placeholders, args)
    return True


def fetch_pending():
    """Return unvisited rows whose child URL is not already stored as a parent page."""
    known_pages = [row[0] for row in mysql_fetch('SELECT DISTINCT(page_url) FROM parent_url',
                                                 res_type='tuple')
                   if row[0] is not None]  # a NULL inside NOT IN (...) would match no row at all
    if not known_pages:
        # "NOT IN ()" is a syntax error, so drop the clause when there is nothing to exclude.
        return mysql_fetch(PENDING_SQL, 'dic')
    placeholders = ','.join(['%s'] * len(known_pages))
    sql_ori = '{} AND children_url NOT IN ({})'.format(PENDING_SQL, placeholders)
    return mysql_fetch(sql_ori, 'dic', known_pages)


browser = webdriver.Chrome()
# Open a search engine first, picked by the current second, before going to the target site.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed step: the start page is always opened and dumped; its links are stored
# only when INSERT_SEED_LINKS is enabled.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
navigate(url)
seed_html = snapshot_page()
if INSERT_SEED_LINKS and store_child_links(browser.title, url,
                                           extract_child_links(seed_html, url)):
    print(DEDUPE_SQL)
    mysql_write(DEDUPE_SQL)

while True:
    mysql_write(sql_filter)
    print(sql_filter)
    print(DEDUPE_SQL)
    mysql_write(DEDUPE_SQL)
    res = fetch_pending()
    if not res:
        # Nothing pending: wait instead of reconnecting to the DB in a tight loop.
        time.sleep(3)
        continue
    # Skip a random number of leading rows, always leaving about the last ten.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        navigate(url)
        try:
            # Scroll to the bottom once after a short wait before reading the source.
            time.sleep(1)
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        except Exception as e:
            print('window.scrollTo-->', e)
        store_child_links(browser.title, url,
                          extract_child_links(snapshot_page(), url, url_kw_filter_l))
        mysql_write(MARK_DONE_SQL, (page_url, url))
        print(MARK_DONE_SQL, (page_url, url))
(父,子)url有序二元组