• Simulating a login to the Amazon Seller Central backend


    This article uses the selenium library under Python 3.4 to open a browser, log in, and save the browser's login cookies to a local file, so that subsequent logins can reuse the cookies directly:

        #!/usr/bin/python3.4
        # -*- coding: utf-8 -*-

        from selenium import webdriver
        import time
        import requests
        from bs4 import BeautifulSoup
        import os
        import re
        import random
        import xlsxwriter


        # List all files under a directory with the given suffix (default .xml); optionally recurse
        def listfiles(rootdir, prefix='.xml', iscur=False):
            file = []
            for parent, dirnames, filenames in os.walk(rootdir):
                if parent == rootdir:
                    for filename in filenames:
                        if filename.endswith(prefix):
                            file.append(filename)
                    if not iscur:
                        return file
                else:
                    if iscur:
                        for filename in filenames:
                            if filename.endswith(prefix):
                                file.append(filename)
                    else:
                        pass
            return file


        # Regex that extracts the ASIN after /dp/ in a product URL
        def getdp(string):
            reg = r'(http.+?/dp/)(.+)'
            all = re.compile(reg)
            alllist = re.findall(all, string)
            return alllist[0][1]


        # Regex that extracts the filter (category) parameter
        # e.g. https://sellercentral.amazon.com/productsearch?filter=grocery&q=fish
        def getfilter(string):
            reg = r'(https.+?filter=)(.+?)(&)'
            all = re.compile(reg)
            alllist = re.findall(all, string)
            return alllist[0][1]


        # Regex that extracts the maximum page count, i.e. the number inside "(...)"
        def getpagenum(string):
            reg = r'(.+?\()(\d+)(\))'
            all = re.compile(reg)
            alllist = re.findall(all, string)
            return alllist[0][1]


        # Create a directory, ignoring the error if it already exists
        def createjia(path):
            try:
                os.makedirs(path)
            except:
                pass


        # Format a duration in seconds as days/hours/minutes/seconds
        def timetochina(longtime, formats='{} days {} hours {} minutes {} seconds'):
            day = 0
            hour = 0
            minutue = 0
            second = 0
            try:
                if longtime > 60:
                    second = longtime % 60
                    minutue = longtime // 60
                else:
                    second = longtime
                if minutue > 60:
                    hour = minutue // 60
                    minutue = minutue % 60
                if hour > 24:
                    day = hour // 24
                    hour = hour % 24
                return formats.format(day, hour, minutue, second)
            except:
                raise Exception('Invalid time value')


        # Open a browser, log in by hand, and capture the session cookies
        def openbrowser(url):
            # Launch Chrome
            # Firefox() Chrome()
            browser = webdriver.Chrome()
            # browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')
            # Open the login page
            browser.get(url)
            # Optionally wait for the browser to finish loading
            # print("Waiting 10 seconds for the browser to open...")
            # time.sleep(10)

            # Locate the boxes with id="ap_email" / id="ap_password"
            # and clear them first
            browser.find_element_by_id("ap_email").clear()
            browser.find_element_by_id("ap_password").clear()

            # Type in the account and password
            inputemail = input("Enter the account email: ")
            inputpassword = input("Enter the password: ")
            browser.find_element_by_id("ap_email").send_keys(inputemail)
            browser.find_element_by_id("ap_password").send_keys(inputpassword)

            # Click the sign-in button
            # id="signInSubmit"
            browser.find_element_by_id("signInSubmit").click()

            # Wait for the login to complete
            # print('Waiting 10 seconds for login...')
            # time.sleep(10)
            print("Waiting for the page to finish loading...")

            select = input("Check the browser: are you logged in? (y/n): ")
            while 1:
                if select == "y" or select == "Y":
                    print("Login successful!")
                    # Collect the cookies
                    cookie = [item["name"] + ":" + item["value"] for item in browser.get_cookies()]
                    cookiestr = ';'.join(item for item in cookie)
                    print("Copying the page cookies...")

                    # Write them to a local txt file
                    if "jp" in url:
                        path = "../data/Japcookie.txt"
                    else:
                        path = "../data/Amecookie.txt"

                    filecookie = open(path, "w")
                    filecookie.write(cookiestr)
                    filecookie.close()

                    time.sleep(1)
                    print("Closing the browser...")
                    browser.quit()
                    # print(cookiestr)
                    break

                elif select == "n" or select == "N":
                    selectno = input("Press 0 if the account/password was wrong, 1 if a captcha appeared...")
                    # Wrong account or password: type them in again
                    if selectno == "0":

                        # Locate the boxes with id="ap_email" / id="ap_password"
                        # and clear them first
                        browser.find_element_by_id("ap_email").clear()
                        browser.find_element_by_id("ap_password").clear()

                        # Type in the account and password
                        inputemail = input("Enter the account email: ")
                        inputpassword = input("Enter the password: ")
                        browser.find_element_by_id("ap_email").send_keys(inputemail)
                        browser.find_element_by_id("ap_password").send_keys(inputpassword)
                        # Click the sign-in button
                        # id="signInSubmit"
                        browser.find_element_by_id("signInSubmit").click()
                        # Ask again whether the login succeeded
                        select = input("Check the browser: are you logged in? (y/n): ")

                    elif selectno == "1":
                        # The captcha box has id="ap_captcha_guess"
                        input("Enter the captcha in the browser and sign in, then press Enter...")
                        select = input("Check the browser: are you logged in? (y/n): ")

                else:
                    print('Please enter "y" or "n"!')
                    select = input("Check the browser: are you logged in? (y/n): ")

            return cookiestr


        # Fetch a page with requests, reusing the saved cookies
        def gethtml(url):
            # Read the cookie file
            # and load it into a dict
            mycookie = {}
            if "jp" in url:
                path = "../data/Japcookie.txt"
            else:
                path = "../data/Amecookie.txt"

            try:
                filecookie = open(path, "r")
                cookies = filecookie.read().split(";")
                for items in cookies:
                    # split only on the first ":" so values containing ":" survive
                    item = items.split(":", 1)
                    mycookie[item[0]] = item[1]
                # print(mycookie)
                filecookie.close()
            except:
                print("The cookie file is empty...")

            if "jp" in url:
                referer = "https://sellercentral.amazon.co.jp/"
                host = "www.amazon.co.jp"
            else:
                referer = "https://sellercentral.amazon.com/"
                host = "www.amazon.com"

            # Build the request headers
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
                'Referer': referer,
                'Host': host,
                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br'
            }

            htmlget = requests.get(url=url, headers=header, cookies=mycookie, timeout=60)
            htmlcontent = htmlget.content.decode("UTF-8", "ignore")

            return htmlcontent


        # Parse a search-result page and, for qualifying products, their detail pages
        def getinfo(html, Loginurl):
            # Parse with BeautifulSoup
            soups = BeautifulSoup(html, "html.parser")
            # Select the product divs
            sellyours = soups.find_all("div", attrs={"class": "a-box product"})
            information = []
            for item in sellyours:
                # Examine the products one by one
                # First filter: keep products that carry a "Sell yours" button
                temp = item.find("a", attrs={"class": "a-button-text"})

                if temp != None:
                    if "sellYoursClick" in temp["data-csm"]:
                        # Second filter: keep products with no offer count ("no number, no New" entries)
                        temp = item.find("span", attrs={"class": "offerCountDetails"})
                        if temp == None:
                            temp = item.find("div", attrs={"class": "a-fixed-right-grid-col description a-col-left"})

                            # Get the detail-page URL
                            hrefurl = temp.find('a').get('href')
                            # Get all the text under the current node,
                            # which includes title, UPC, EAN and Rank
                            try:
                                spans = temp.get_text()
                            except:
                                spans = "Nothing"
                            # Store the text lines in a list
                            temparr = spans.strip().split("\n")
                            # Extract the ASIN with the regex
                            asin = getdp(hrefurl)
                            temparr.append(asin)
                            temparr.append(hrefurl)

                            # Also append a copy to a txt file so nothing is lost if the program dies
                            txtcontent = ' '.join(temparr)
                            filename = time.strftime('%Y%m%d', time.localtime())
                            path = "../xls/" + filename
                            createjia(path)
                            file = open(path + "/" + filename + ".txt", "a")
                            file.write("\n" + txtcontent)
                            file.close()

                            # Parse the detail page: if no price is shown, grab the review data,
                            # store it in the list and later write it to Excel
                            # Fetch the detail page
                            htmldetail = gethtml(hrefurl)

                            if 'id="words"' in htmldetail or 'ap_email' in htmldetail or "Amazon.com Page Not Found" in htmldetail:
                                print("Crawling too fast! Need to log in again...")
                                openbrowser(Loginurl)
                                htmldetail = gethtml(hrefurl)

                            # Parse with BeautifulSoup
                            soups = BeautifulSoup(htmldetail, "html.parser")
                            # Select the centerCol block of the detail page
                            centerCols = soups.findAll('div', attrs={'id': "centerCol"})
                            if centerCols:
                                for item in centerCols:
                                    temp = item.find("td", attrs={"id": "priceblock_ourprice_lbl"})
                                    if temp == None:
                                        # Star rating
                                        star = item.find("a", attrs={"id": "reviewStarsLinkedCustomerReviews"}).get_text()
                                        # Number of reviews
                                        reviews = item.find("span", attrs={"id": "acrCustomerReviewText"}).get_text()
                                        # Append them to the list
                                        if star:
                                            temparr.append(star.strip().replace(" out of 5 stars", ""))
                                        else:
                                            temparr.append("")
                                        if reviews:
                                            temparr.append(reviews.strip().replace(" customer reviews", ""))
                                        else:
                                            temparr.append("")

                                        information.append(temparr)
                                        print(information)
                            else:
                                temparr.append("")
                                temparr.append("")
                                information.append(temparr)
                                print(information)
            return information


        def begin():
            taoyanbai = '''
                    -----------------------------------------
                    | Welcome to the backend crawler        |
                    | Date: 2016-10-21                      |
                    | By: the tech department               |
                    -----------------------------------------
                '''
            print(taoyanbai)


        if __name__ == "__main__":

            a = time.clock()

            while 1:
                try:
                    LoginWhere = int(input("Press 0 to crawl the US site, 1 for Japan: "))
                    if LoginWhere == 0:
                        Loginurl = "https://sellercentral.amazon.com/"
                        break
                    elif LoginWhere == 1:
                        Loginurl = "https://sellercentral.amazon.co.jp/"
                        break
                except:
                    print("Please enter 0 or 1!")

            keywords = input("Enter the search keyword: ")
            keyword = keywords.replace(" ", "+")

            print("Checking the login status...")

            if "jp" in Loginurl:
                seekurl = "https://sellercentral.amazon.co.jp/productsearch?q=" + str(keyword)
            else:
                seekurl = "https://sellercentral.amazon.com/productsearch?q=" + str(keyword)

            try:
                htmlpage = gethtml(seekurl)
            except Exception as err:
                input("The network seems to have a problem...")
                print(err)
                exit()

            while 1:
                if 'ap_email' in htmlpage or "Amazon.com Page Not Found" in htmlpage or "<title>404" in htmlpage:
                    print("The cookies have expired, logging in again...")
                    print("Waiting for the page to open...")
                    openbrowser(Loginurl)
                    htmlpage = gethtml(seekurl)
                else:
                    print("Logged in directly with the saved cookies...")
                    break

            # Parse with BeautifulSoup
            soups = BeautifulSoup(htmlpage, "html.parser")
            # Select the categories and their URLs
            categorys = soups.findAll('ul', attrs={'class': "a-nostyle a-vertical"})
            categoryurl = []
            categoryname = ""
            pagenum = []
            filtername = []

            for item in categorys:
                for temp in item.find_all("a"):
                    hrefurl = temp.get('href')
                    categoryurl.append(hrefurl)

                for temp in item.find_all("span", attrs={"class": "a-color-tertiary"}):
                    spantext = temp.get_text()
                    pagenum.append(getpagenum(spantext))
            for i in range(0, len(categoryurl)):
                name = getfilter(categoryurl[i])
                filtername.append(name)
                categoryname = categoryname + "to crawl (" + str(name) + ") press " + str(i) + ", "

            # Choose the category to crawl
            try:
                print(categoryname)
                selectcategory = int(input("Enter the number of the category you want to crawl: "))
            except:
                print("Please enter one of the numbers above!")
                print(categoryname)
                selectcategory = int(input("Enter the number of the category you want to crawl: "))

            filter = filtername[selectcategory]
            mustpage = int(pagenum[selectcategory]) // 10

            try:
                print("Note: (1) the backend only shows 1000 pages... (2) the chosen category has roughly " + str(mustpage) + " pages...")
                page = int(input("How many pages do you want to crawl? (default 15): "))
                if page > 1000:
                    print("The backend shows at most 1000 pages!")
                    page = int(input("Only 1000 pages are visible! The chosen category has roughly " + str(mustpage) + " pages! How many pages do you want to crawl? (default 15): "))
            except:
                page = 15

            # Collected results
            information = []
            temparr = []

            for i in range(0, page):
                try:
                    if "jp" in Loginurl:
                        # https://sellercentral.amazon.co.jp/productsearch?filter=sporting&q=空気入れ&page=2
                        openurl = "https://sellercentral.amazon.co.jp/productsearch?filter=" + str(filter) + "&q=" + str(
                            keyword) + "&page=" + str(i + 1)
                    else:
                        # https://sellercentral.amazon.com/productsearch?filter=pets&q=dog
                        openurl = "https://sellercentral.amazon.com/productsearch?filter=" + str(filter) + "&q=" + str(
                            keyword) + "&page=" + str(i + 1)

                    print("Crawling: " + str(openurl))
                    openhtml = gethtml(openurl)

                    # Parse with BeautifulSoup
                    soups = BeautifulSoup(openhtml, "html.parser")
                    # Select the product divs
                    sellyours = soups.findAll('div', attrs={'class': "product"})

                    if 'ap_email' in openhtml or "Amazon.com Page Not Found" in openhtml:
                        print("Crawling too fast! Need to log in again...")
                        openbrowser(Loginurl)
                        openhtml = gethtml(openurl)

                    elif not sellyours:
                        # findAll returns an empty list (never None) when nothing matches
                        print("Already past the last page...")
                        break
                    temparr = getinfo(openhtml, Loginurl)
                except Exception as err:
                    print(err)
                    print("A small error occurred while crawling...")
                    print("Pausing 20 seconds to log the bug and try to recover...")
                    time.sleep(20)

                if temparr:
                    information.append(temparr[0])
                loadtime = random.randint(5, 10)
                print("Pausing " + str(loadtime) + " seconds to avoid the anti-crawling checks...")
                time.sleep(loadtime)

            print("Collected list:")
            print(information)

            # Write the results to Excel
            # Create the output directory
            filename = time.strftime('%Y%m%d', time.localtime())
            path = "../xls/" + filename
            createjia(path)

            # Write the Excel file
            timename = time.strftime('%Y%H%M%S', time.localtime())
            with xlsxwriter.Workbook(path + "/" + timename + '.xlsx') as workbook:
                worksheet = workbook.add_worksheet()

                first = ['title', 'UPC', 'EAN', 'Rank', 'Nothing', 'ASIN', 'DetailUrl', 'Star', 'Reviews']
                # Header row
                for i in range(0, len(first)):
                    worksheet.write(0, i, first[i])
                # Data rows
                for m in range(0, len(information)):
                    for n in range(0, len(information[m])):
                        insert = (str(information[m][n]).replace("UPC: ", "").replace("EAN: ", "")
                                  .replace("Sales Rank:", "").replace("customer reviews", "")
                                  .replace("out of 5 stars", ""))
                        worksheet.write(m + 1, n, insert)
                # the with-block closes the workbook automatically

            b = time.clock()
            print('Elapsed time: ' + timetochina(b - a))
            input('Press Enter to close the window')  # keep the console open so the elapsed time stays visible
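
    The listing above uses the selenium 2/3 API that was current in 2016 (find_element_by_id and friends); in selenium 4 those helpers were deprecated and later removed. As a rough orientation only, here is a minimal sketch of the same login-and-dump-cookies step against the selenium 4 API, assuming the form ids ap_email, ap_password and signInSubmit are unchanged (the helper name is mine, not from the original):

        from selenium import webdriver
        from selenium.webdriver.common.by import By

        def login_and_dump_cookies(url, email, password, path="../data/Amecookie.txt"):
            # selenium >= 4.6 can locate a matching chromedriver on its own
            browser = webdriver.Chrome()
            browser.get(url)
            browser.find_element(By.ID, "ap_email").send_keys(email)
            browser.find_element(By.ID, "ap_password").send_keys(password)
            browser.find_element(By.ID, "signInSubmit").click()
            input("Finish any captcha in the browser, then press Enter...")
            # same "name:value;name:value" format that gethtml() expects
            cookiestr = ";".join(c["name"] + ":" + c["value"] for c in browser.get_cookies())
            with open(path, "w") as f:
                f.write(cookiestr)
            browser.quit()
            return cookiestr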

    To drive Chrome, selenium needs a separate driver executable that matches the installed browser; for the Chrome browser used in this article, download the driver and put it into a directory such as C:\Python34:

    The driver is chromedriver.exe; search for it online, there are plenty of download mirrors.
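
    If chromedriver.exe is not on the PATH, you can also point selenium at it explicitly. A small sketch for both API generations (the C:/Python34 location is just the example path used above):

        from selenium import webdriver

        # selenium 2/3 (the API used in this article): pass the path directly
        browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')

        # selenium 4: the path moves onto a Service object
        from selenium.webdriver.chrome.service import Service
        browser = webdriver.Chrome(service=Service('C:/Python34/chromedriver.exe'))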

  • Original article: https://www.cnblogs.com/TTyb/p/5979680.html