• 阿X妈X团长信息采集脚本


    某宝使用 Chrome 自动化会被检测,所以采用 Firefox 浏览器做自动化。运行此脚本前请先安装 Firefox,并将驱动文件放在运行目录下的 drivers\geckodriver.exe 路径。

    # -*- coding: utf-8 -*-
    # @Time : 2020-4-22 22:39
    # @Author : hlikex
    # @File : main.py
    
    from selenium import webdriver
    from urllib import parse
    from selenium.webdriver.chrome.options import Options
    import logging
    import time
    import random
    import pandas as pd
    import os
    import re
    
    
    class Taobao:
        """Collect campaign-leader ("团长") records from the Alimama recruiting page.

        The scraper drives Firefox through Selenium, logs in through the embedded
        Taobao login frame, opens the detail card of every table row that has not
        been saved yet, and appends one record per leader to ``data.xls``.
        """

        # Values written to the 'V标团长' column.
        # NOTE(review): the published script assigns "" in both branches, so the
        # column carries no information; the distinct markers were presumably
        # lost when the script was posted. Set these once the intended values
        # are known.
        V_MARK_YES = ""
        V_MARK_NO = ""

        # Absolute XPaths of the (possibly truncated) team introduction inside
        # the detail card and of the full text shown after clicking "...更多".
        _ABOUT_XPATH = '/html/body/div[4]/div/div/div[2]/div/div[1]/div/div[2]/div[3]/p[2]'
        _ABOUT_FULL_XPATH = '/html/body/div[5]/div/div/div/div[1]/div[2]'

        def __init__(self, username="username", password="password"):
            """Load previously saved records (if any) and start Firefox.

            Args:
                username: Account name typed into the login form.
                password: Password typed into the login form.
            """
            self.username = username
            self.password = password
            self.root_url = "https://ad.alimama.com/zhaoshang/cpevent/index.htm?srcCode=1&pageNo=1&onlyCanJoin=0&pageSize=800"
            if os.path.exists("data.xls"):
                self.df = pd.read_excel("data.xls", index_col=0)
            else:
                self.df = pd.DataFrame(columns=("团长", '旺旺', 'V标团长', '总成交金额(星级)', "总成交笔数(星级)", '打爆能力(星级)', '服务时间', '钉钉', '团队介绍'))
            # Build the driver path with os.path.join: plain string concatenation
            # drops the separators between the working directory, "drivers" and
            # the executable name.
            driver_path = os.path.join(os.getcwd(), "drivers", "geckodriver.exe")
            self.driver = webdriver.Firefox(executable_path=driver_path)

            self.driver.implicitly_wait(10)
            logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
                                level=logging.WARNING)

        def isElementExist(self, id, element):
            """Return True if the driver can locate an element, else False.

            Args:
                id: Locator strategy string passed to ``find_element``
                    (e.g. ``'link text'``).
                element: Locator value for that strategy.
            """
            try:
                self.driver.find_element(id, element)
                return True
            except Exception:
                # Any lookup failure is treated as "not present".
                return False

        def login(self):
            """Open the root URL and submit the credentials in the login frame.

            Failures are printed rather than raised.
            """
            self.driver.get(self.root_url)
            try:
                self.driver.switch_to.frame('taobaoLoginIfr')
                self.driver.find_element("id", 'fm-login-id').send_keys(self.username)
                self.driver.find_element("id", 'fm-login-password').send_keys(self.password)
                self.driver.find_element("css selector", '[class="fm-button fm-submit password-login"]').click()
                time.sleep(3)
                print(self.driver.current_url)
                self.driver.switch_to.default_content()
                self.driver.refresh()
                time.sleep(2.5)
            except Exception as e:
                print(e)

        @staticmethod
        def _parse_row(html):
            """Extract the row-level fields from a table row's outer HTML.

            Returns:
                A dict with the leader name, Wangwang id, V-mark and the three
                star ratings, or None when the row lacks any of them (for
                example a header row).
            """
            names = re.findall(r"{cpName:'(.*?)',cpPubId:'.*?'}", html)
            uids = re.findall(r'uid=(.*?)&', html)
            # "?" must be escaped and the digit class written as \d, otherwise
            # the pattern never matches a star image URL.
            stars = re.findall(r'app/gallery/mx-effects/star\?num=(\d)', html)
            if not names or len(uids) < 2 or len(stars) < 3:
                return None
            v_mark = Taobao.V_MARK_YES if 'V标团长' in html else Taobao.V_MARK_NO
            return {
                "团长": names[0],
                '旺旺': uids[1],  # the second uid= occurrence is the one recorded
                'V标团长': v_mark,
                '总成交金额(星级)': stars[0],
                "总成交笔数(星级)": stars[1],
                '打爆能力(星级)': stars[2],
            }

        @staticmethod
        def _parse_detail(html):
            """Extract ``(service_time, dingtalk_ids)`` from the detail card HTML.

            Returns None when the service-time field is absent. All DingTalk ids
            found are URL-decoded and concatenated into one string.
            """
            service = re.findall(r"服务时间:</div><div>(.*?)</div>", html)
            if not service:
                return None
            dings = re.findall(r'dingtalk_id=(.*?)"', html)
            ding = "".join(str(parse.unquote(i)) for i in dings)
            return service[0], ding

        def _scrape_detail(self, element, row):
            """Open a row's detail card and return the complete record dict.

            Returns None when the card has no service-time field; the card is
            left open in that case and the caller is expected to close it.
            """
            element.find_element("css selector", '[class="card pointer"]').click()
            time.sleep(2)
            html2 = self.driver.find_element("css selector", '[class="adv-threebq adv-threebu"]').get_attribute("outerHTML")
            detail = self._parse_detail(html2)
            if detail is None:
                return None
            service_date, ding = detail

            about = self.driver.find_element("xpath", self._ABOUT_XPATH).text
            if '...更多' in str(about):
                # The introduction is truncated: expand it, read the full text
                # (with tags stripped) and dismiss the expansion dialog.
                self.driver.find_element("xpath", self._ABOUT_XPATH + '/span').click()
                about = self.driver.find_element("xpath", self._ABOUT_FULL_XPATH).get_attribute('outerHTML')
                about = re.sub(r'<.*?>', "", about)
                self.driver.find_element("link text", '确定').click()

            record = dict(row)
            record.update({'服务时间': service_date, '钉钉': ding, '团队介绍': about})
            return record

        def _close_dialogs(self):
            """Dismiss the "确定" dialog and the "关闭" card if either is present."""
            if self.isElementExist('link text', "确定"):
                self.driver.find_element("link text", '确定').click()
            if self.isElementExist('link text', "关闭"):
                self.driver.find_element("link text", '关闭').click()

        def getInfo(self):
            """Scrape every unsaved leader row on the current page.

            Each new record is printed, appended to ``self.df`` and written to
            ``data.xls`` immediately. Rows that cannot be parsed are skipped; any
            other failure closes open dialogs, saves the data and is printed.
            """
            # Names already on disk; kept as a set so the per-row check is O(1).
            seen = set(self.df["团长"].values.tolist())
            for element in self.driver.find_elements("tag name", 'tr'):
                try:
                    row = self._parse_row(element.get_attribute("outerHTML"))
                    if row is None or row["团长"] in seen:
                        continue
                    record = self._scrape_detail(element, row)
                    if record is None:
                        # The card was opened but is unusable: close it so it
                        # does not block the click on the next row.
                        self._close_dialogs()
                        continue
                    print(record)
                    df2 = pd.DataFrame(record, index=[len(self.df) + 1])
                    self.driver.switch_to.default_content()
                    time.sleep(1.5)
                    self.driver.find_element("link text", '关闭').click()
                    # DataFrame.append was removed in pandas 2.0; concat works
                    # on old and new versions alike.
                    self.df = pd.concat([self.df, df2])
                    self.df.to_excel("data.xls")
                    seen.add(record["团长"])
                except Exception as e:
                    self._close_dialogs()
                    self.df.to_excel("data.xls")
                    print(e)

        def nextPage(self):
            """Click the next-page control; failures are printed, not raised."""
            try:
                self.driver.switch_to.default_content()
                self.driver.find_element("css selector", '[class="mc-iconfont adv-threefY rotate180 "]').click()
            except Exception as e:
                print(e)

        def run(self):
            """Log in, then scrape and refresh the page once a minute, forever."""
            self.login()
            while True:
                pages = re.findall(r"pageNo=(.*?)&", str(self.driver.current_url))
                if not pages:
                    # No pageNo in the URL: the session was dropped. Wait, log
                    # in again and retry instead of using an undefined page.
                    print("用户被注销")
                    time.sleep(60)
                    self.login()
                    continue
                print("当前页码:{}".format(pages[0]))
                self.getInfo()
                time.sleep(60)
                self.driver.refresh()


    if __name__ == '__main__':
        T = Taobao()
        T.run()
  • 相关阅读:
    异常处理机制中的return关键字
    QuickHit 项目
    kali2.0升级
    sslscan
    RC4弱密码套件检测
    CVE-2017-11882漏洞复现
    应急响应小总结
    服务器日志分析
    《x86汇编语言:从实模式到保护模式 》学习笔记之:第一次编写汇编语言
    nasm不是内部或外部命令
  • 原文地址:https://www.cnblogs.com/hlikex/p/12768185.html
Copyright © 2020-2023  润新知