• papunika


    import os
    import ssl
    import sys
    import time
    
    import pymysql
    import undetected_chromedriver as uc
    from selenium import webdriver
    
    # Append the directory three levels above this file to sys.path before the
    # project import below (presumably where spider_setting lives - verify).
    path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    sys.path.append(path)
    # NOTE(review): MYSQL_POST is used as the connection port in Papunika.__init__;
    # the name looks like a typo of MYSQL_PORT - confirm in spider_setting before renaming.
    from spider_setting import MYSQL_HOST, MYSQL_POST, MYSQL_PASSWORD, MYSQL_USER
    
    
    class Papunika(object):
        """Save papunika.com pages listed in MySQL as local HTML snapshots.

        Instantiating the class connects to the ``cloud_joy_monitoring``
        database and immediately runs :meth:`main`, which walks the
        ``papunika_url`` table and stores every page that has not been
        saved yet.
        """

        # Root directory of the saved pages; html_en / html_cn live beneath it.
        BASE_DIR = "E:/07-shunwangwork/33-游戏运营/papunika/html"
        # Only URLs containing this prefix (and ending with "/") are crawled.
        SITE_ROOT = "https://papunika.com/"
        # Script run in each page to delete the listed elements before saving.
        REMOVE_SCRIPT = "document.querySelectorAll('.nk-gap-2, .code-block,#BorlabsCookieBoxWrap,#BorlabsCookieBox, #menu-item-9627, #menu-item-10519, #borlabs-cookie-js-after').forEach(node=>node.remove())"

        def __init__(self):
            self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_POST, database="cloud_joy_monitoring", user=MYSQL_USER, password=MYSQL_PASSWORD, charset='utf8', autocommit=True)
            self.cursor = self.db.cursor()

            self.main()

        def main(self):
            """Crawl every eligible URL from ``papunika_url`` and save its HTML.

            A URL is eligible when it contains ``SITE_ROOT`` and ends with "/".
            Pages whose English snapshot already exists on disk are skipped.
            The browser is always shut down, even if crawling raises.
            """
            chrome_options = self._build_options()
            # Disable HTTPS certificate verification process-wide.
            ssl._create_default_https_context = ssl._create_unverified_context

            uc.TARGET_VERSION = 101
            driver = webdriver.Chrome(options=chrome_options)
            try:
                driver.get(self.SITE_ROOT)
                self.cursor.execute('select id, url from papunika_url order by id')
                for data in self.cursor.fetchall():
                    url = data[1]
                    # Skip NULL/empty rows instead of failing on the `in` test.
                    if not url:
                        continue
                    if self.SITE_ROOT not in url or not url.endswith("/"):
                        continue
                    print(data)
                    if os.path.exists(self.save_path(self._url_to_key(url), False)):
                        continue
                    self._snapshot(driver, url)
            finally:
                # quit() ends the whole session and the chromedriver process;
                # close() would only close the current window.
                driver.quit()

        def _build_options(self):
            """Return the ChromeOptions used for the crawl."""
            chrome_options = webdriver.ChromeOptions()
            # Uncomment to run without a visible browser window:
            # chrome_options.add_argument('--headless')
            # Works around the "DevToolsActivePort file doesn't exist" error.
            chrome_options.add_argument('--no-sandbox')
            # Recommended by the Chrome docs to avoid a bug.
            chrome_options.add_argument('--disable-gpu')
            # Browser language/locale (this is not a text-encoding setting).
            chrome_options.add_argument('--lang=zh-CN')
            chrome_options.add_argument('--disable-javascript')
            chrome_options.add_argument('--disable-java')

            prefs = {
                'profile.default_content_setting_values': {
                    'images': 2,
                    'javascript': 2,  # 2 means "block"
                }
            }
            chrome_options.add_experimental_option('prefs', prefs)

            # Hide scrollbars, for some special pages.
            chrome_options.add_argument('--hide-scrollbars')
            chrome_options.add_argument("--proxy-server=192.168.104.134:7890")
            # Do not load images. The leading "--" is required for Chrome to
            # parse this as a switch rather than a positional argument.
            chrome_options.add_argument('--blink-settings=imagesEnabled=false')
            # Start with the window maximized.
            chrome_options.add_argument('--start-maximized')
            return chrome_options

        @classmethod
        def _url_to_key(cls, url):
            """Map a page URL to its file key ("index" for the site root).

            The site prefix is removed and one trailing "/" is dropped only
            when present, so a URL without a trailing slash keeps its last
            character.
            """
            key = url.replace(cls.SITE_ROOT, "")
            if key.endswith("/"):
                key = key[:-1]
            return key or "index"

        def _snapshot(self, driver, url):
            """Load ``url``, strip unwanted elements and write the page to disk."""
            driver.get(url)
            time.sleep(1)
            handles = driver.window_handles
            driver.switch_to.window(handles[-1])
            time.sleep(1)
            driver.execute_script(self.REMOVE_SCRIPT)
            # Re-derive the key from the final URL in case of a redirect.
            url = driver.current_url
            key = self._url_to_key(url)
            print("js执行完成:{}".format(url))
            time.sleep(1)
            page = driver.page_source
            save_path = self.save_path(key, False)
            print(url, save_path, handles)
            directory = save_path.rpartition("/")[0]
            try:
                self.save_file(save_path, page, directory)
            except (OSError, UnicodeError) as e:
                # Best effort: report the failure and move on to the next page.
                print("错误:{}".format(e))

        def save_path(self, number, status):
            """Return the HTML file path for key ``number``.

            ``status`` truthy selects the ``html_cn`` folder, otherwise
            ``html_en``.
            """
            folder = "html_cn" if status else "html_en"
            return "{}/{}/{}.html".format(self.BASE_DIR, folder, number)

        def save_file(self, file_name, page, path):
            """Write ``page`` to ``file_name`` prefixed with an HTML doctype line.

            ``path`` is the directory to create first; an empty string means
            the current directory and nothing is created.
            """
            if path:
                # exist_ok avoids the check-then-create race.
                os.makedirs(path, exist_ok=True)
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write("<!DOCTYPE html>\n")
                f.write(page)
    
    
    # Script entry point: constructing Papunika connects to MySQL and runs the
    # crawl from its __init__.
    if __name__ == "__main__":
        Papunika()
    
    # Disabled experiment kept as a bare string literal (never executed): it
    # parses the page with etree and removes the #BorlabsCookieBox markup and
    # text from `page` via string replacement.
    '''
    html = etree.HTML(page)
                for i in range(5):
                    content = html.xpath('//*[@id="BorlabsCookieBox"]')
                    if content:
                        data = etree.tostring(content[0], encoding="utf-8").decode("utf-8")
                        for i in re.findall(r'(<.*?>)', data):
                            page = page.replace(i, "")
                        print(data)
                    text = html.xpath('//*[@id="BorlabsCookieBox"]//text')
                    if text:
                        for i in text:
                            page = page.replace(i, "")
                        print(data)
    '''
    

      

  • 相关阅读:
    百度富文本编辑器的上传图片的路径问题
    laravel初次学习总结及一些细节
    macOS apache配置及开启虚拟服务器的开启,apache开启重写模式
    类似于qq空间类型的评论和回复
    向php提交数据及json
    mac 初次配置apache,及mac下安装mysql
    C#连接mysql数据库插入数据后获取自增长主键ID值
    PHP 真正多线程的使用
    C# 连接mysql数据库
    MySql状态查看方法 MySql如何查看连接数和状态?
  • 原文地址:https://www.cnblogs.com/yoyo1216/p/16228456.html
Copyright © 2020-2023  润新知