This article uses the selenium library under Python 3.4 to open a browser and save that browser's login cookies to a local file, so that the next time you need to log in you can simply reuse the cookies:
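Before the full script, here is a minimal sketch of the round trip the script automates: selenium handles the interactive login, browser.get_cookies() is dumped to a text file, and later runs hand those cookies to requests. The helper names save_cookies/load_cookies, the file name cookies.txt and the commented-out URL are illustrative only; the real script below uses its own file layout (../data/Amecookie.txt and ../data/Japcookie.txt).

from selenium import webdriver
import requests


def save_cookies(url, path="cookies.txt"):
    # First run: log in inside the selenium-driven browser, then dump its cookies
    browser = webdriver.Chrome()
    browser.get(url)
    input("Log in in the browser window, then press Enter...")
    cookiestr = ";".join(c["name"] + ":" + c["value"] for c in browser.get_cookies())
    with open(path, "w") as f:
        f.write(cookiestr)
    browser.quit()
    return cookiestr


def load_cookies(path="cookies.txt"):
    # Later runs: rebuild the name -> value dict that requests expects
    with open(path) as f:
        return dict(item.split(":", 1) for item in f.read().split(";") if item)


# save_cookies("https://sellercentral.amazon.com/")
# html = requests.get("https://sellercentral.amazon.com/",
#                     cookies=load_cookies(), timeout=60).text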
#!/usr/bin/python3.4
# -*- coding: utf-8 -*-

from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import os
import re
import random
import xlsxwriter


# List every file under rootdir with the given suffix (.xml by default); optionally recurse
def listfiles(rootdir, prefix='.xml', iscur=False):
    file = []
    for parent, dirnames, filenames in os.walk(rootdir):
        if parent == rootdir:
            for filename in filenames:
                if filename.endswith(prefix):
                    file.append(filename)
            if not iscur:
                return file
        else:
            if iscur:
                for filename in filenames:
                    if filename.endswith(prefix):
                        file.append(filename)
            else:
                pass
    return file


# Regex: pull the ASIN out of a /dp/ product URL
def getdp(string):
    reg = r'(http.+?/dp/)(.+)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex: pull the filter (category) parameter out of a search URL, e.g.
# https://sellercentral.amazon.com/productsearch?filter=grocery&q=fish
def getfilter(string):
    reg = r'(https.+?filter=)(.+?)(&)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex: pull the maximum page count out of text such as "Category (123)"
def getpagenum(string):
    reg = r'(.+?\()(\d+)(\))'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Create a directory, ignoring the error if it already exists
def createjia(path):
    try:
        os.makedirs(path)
    except:
        pass


# Convert a number of seconds into a "days/hours/minutes/seconds" string
def timetochina(longtime, formats='{}天{}小时{}分钟{}秒'):
    day = 0
    hour = 0
    minute = 0
    second = 0
    try:
        if longtime > 60:
            second = longtime % 60
            minute = longtime // 60
        else:
            second = longtime
        if minute > 60:
            hour = minute // 60
            minute = minute % 60
        if hour > 24:
            day = hour // 24
            hour = hour % 24
        return formats.format(day, hour, minute, second)
    except:
        raise Exception('时间非法')


# Open a browser, let the user log in, and save the login cookies to a local txt file
def openbrowser(url):
    # Launch Chrome (Firefox() works as well)
    browser = webdriver.Chrome()
    # browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')
    # Open the login page
    browser.get(url)
    # Optionally give the browser time to load
    # print("等待10秒打开浏览器...")
    # time.sleep(10)

    # Locate the id="ap_email" / id="ap_password" inputs and clear them
    browser.find_element_by_id("ap_email").clear()
    browser.find_element_by_id("ap_password").clear()

    # Type in the account and password
    inputemail = input("请输入账号:")
    inputpassword = input("请输入密码:")
    browser.find_element_by_id("ap_email").send_keys(inputemail)
    browser.find_element_by_id("ap_password").send_keys(inputpassword)

    # Click the sign-in button (id="signInSubmit")
    browser.find_element_by_id("signInSubmit").click()

    # Optionally wait ~10 seconds for the login to finish
    # print('等待登陆10秒...')
    # time.sleep(10)
    print("等待网址加载完毕...")

    select = input("请观察浏览器网站是否已经登陆(y/n):")
    while 1:
        if select == "y" or select == "Y":
            print("登陆成功!")
            # Collect the cookies as "name:value" pairs
            cookie = [item["name"] + ":" + item["value"] for item in browser.get_cookies()]
            cookiestr = ';'.join(item for item in cookie)
            print("正在复制网页cookie...")

            # Save them to a local txt file
            if "jp" in url:
                path = "../data/Japcookie.txt"
            else:
                path = "../data/Amecookie.txt"

            filecookie = open(path, "w")
            filecookie.write(cookiestr)
            filecookie.close()

            time.sleep(1)
            print("准备关闭浏览器...")
            browser.quit()
            # print(cookiestr)
            break

        elif select == "n" or select == "N":
            selectno = input("账号密码错误请按0,验证码出现请按1...")
            # Wrong account or password: enter them again
            if selectno == "0":
                # Clear the id="ap_email" / id="ap_password" inputs
                browser.find_element_by_id("ap_email").clear()
                browser.find_element_by_id("ap_password").clear()

                # Type in the account and password again
                inputemail = input("请输入账号:")
                inputpassword = input("请输入密码:")
                browser.find_element_by_id("ap_email").send_keys(inputemail)
                browser.find_element_by_id("ap_password").send_keys(inputpassword)
                # Click the sign-in button (id="signInSubmit")
                browser.find_element_by_id("signInSubmit").click()
                select = input("请观察浏览器网站是否已经登陆(y/n):")

            elif selectno == "1":
                # The captcha field is the input with id="ap_captcha_guess"
                input("请在浏览器中输入验证码并登陆...")
                select = input("请观察浏览器网站是否已经登陆(y/n):")

        else:
            print("请输入“y”或者“n”!")
            select = input("请观察浏览器网站是否已经登陆(y/n):")

    return cookiestr


# Fetch a page with requests, sending the cookies saved by openbrowser()
def gethtml(url):
    # Read the saved cookie string and turn it into a dict
    mycookie = {}
    if "jp" in url:
        path = "../data/Japcookie.txt"
    else:
        path = "../data/Amecookie.txt"

    try:
        filecookie = open(path, "r")
        cookies = filecookie.read().split(";")
        for items in cookies:
            item = items.split(":", 1)
            mycookie[item[0]] = item[1]
        # print(mycookie)
        filecookie.close()
    except:
        print("cookie为空...")

    if "jp" in url:
        referer = "https://sellercentral.amazon.co.jp/"
        host = "www.amazon.co.jp"
    else:
        referer = "https://sellercentral.amazon.com/"
        host = "www.amazon.com"

    # Build the request headers
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Referer': referer,
        'Host': host,
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br'
    }

    htmlget = requests.get(url=url, headers=header, cookies=mycookie, timeout=60)
    htmlcontent = htmlget.content.decode("UTF-8", "ignore")

    return htmlcontent


# Parse a search-result page and collect product data, re-logging in when the cookie dies
def getinfo(html, Loginurl):
    # Parse the page with BeautifulSoup
    soups = BeautifulSoup(html, "html.parser")
    # Select the product divs
    sellyours = soups.find_all("div", attrs={"class": "a-box product"})
    information = []
    for item in sellyours:
        # First filter: keep products that carry a "Sell yours" button
        temp = item.find("a", attrs={"class": "a-button-text"})

        if temp != None:
            if "sellYoursClick" in temp["data-csm"]:
                # Second filter: keep products without an offer count ("no number, no new offers")
                temp = item.find("span", attrs={"class": "offerCountDetails"})
                if temp == None:
                    temp = item.find("div", attrs={"class": "a-fixed-right-grid-col description a-col-left"})

                    # Detail-page URL
                    hrefurl = temp.find('a').get('href')
                    # All the text in this block: title, UPC, EAN, Rank
                    try:
                        spans = temp.get_text()
                    except:
                        spans = "Nothing"
                    # Split the text into a list
                    temparr = spans.strip().split(" ")
                    # Extract the ASIN with the regex helper
                    asin = getdp(hrefurl)
                    temparr.append(asin)
                    temparr.append(hrefurl)

                    # Append a copy to a txt file so nothing is lost if the program is interrupted
                    txtcontent = ' '.join(temparr)
                    filename = time.strftime('%Y%m%d', time.localtime())
                    path = "../xls/" + filename
                    createjia(path)
                    file = open(path + "/" + filename + ".txt", "a")
                    file.write(" " + txtcontent)
                    file.close()

                    # Parse the detail page; if it has no price block, grab the star rating and
                    # review count, store them in the row and write them to Excel later
                    htmldetail = gethtml(hrefurl)

                    if 'id="words"' in htmldetail or 'ap_email' in htmldetail or "Amazon.com Page Not Found" in htmldetail:
                        print("抓取得太快!需要重新登陆...")
                        openbrowser(Loginurl)
                        htmldetail = gethtml(hrefurl)

                    # Parse the detail page with BeautifulSoup
                    soups = BeautifulSoup(htmldetail, "html.parser")
                    # Select the centerCol block of the detail page
                    centerCols = soups.findAll('div', attrs={'id': "centerCol"})
                    if centerCols:
                        for item in centerCols:
                            temp = item.find("td", attrs={"id": "priceblock_ourprice_lbl"})
                            if temp == None:
                                # Star rating
                                star = item.find("a", attrs={"id": "reviewStarsLinkedCustomerReviews"}).get_text()
                                # Number of reviews
                                reviews = item.find("span", attrs={"id": "acrCustomerReviewText"}).get_text()
                                # Append both to the row
                                if star:
                                    temparr.append(star.strip().replace(" out of 5 stars", ""))
                                else:
                                    temparr.append("")
                                if reviews:
                                    temparr.append(reviews.strip().replace(" customer reviews", ""))
                                else:
                                    temparr.append("")

                                information.append(temparr)
                                print(information)
                    else:
                        temparr.append("")
                        temparr.append("")
                        information.append(temparr)
                        print(information)
    return information


def begin():
    taoyanbai = '''
    -----------------------------------------
    |       欢迎使用后台爬虫系统           |
    |       时间:2016年10月21日           |
    |       出品:技术部                   |
    -----------------------------------------
    '''
    print(taoyanbai)


if __name__ == "__main__":

    a = time.clock()

    while 1:
        try:
            LoginWhere = int(input("抓取美国请按0,日本请按1:"))
            if LoginWhere == 0:
                Loginurl = "https://sellercentral.amazon.com/"
                break
            elif LoginWhere == 1:
                Loginurl = "https://sellercentral.amazon.co.jp/"
                break
        except:
            print("请正确输入0或1!!")
            LoginWhere = int(input("抓取美国请按0,日本请按1:"))

    keywords = input("请输入查找的关键词:")
    keyword = keywords.replace(" ", "+")

    print("正在检查登陆状态...")

    if "jp" in Loginurl:
        seekurl = "https://sellercentral.amazon.co.jp/productsearch?q=" + str(keyword)
    else:
        seekurl = "https://sellercentral.amazon.com/productsearch?q=" + str(keyword)

    try:
        htmlpage = gethtml(seekurl)
    except Exception as err:
        input("网络似乎有点问题...")
        print(err)
        exit()

    # If the saved cookie has expired, open the browser and log in again
    while 1:
        if 'ap_email' in htmlpage or "Amazon.com Page Not Found" in htmlpage or "<title>404" in htmlpage:
            print("cookie已经过期,需要重新登陆...")
            print("等待网页打开...")
            openbrowser(Loginurl)
            htmlpage = gethtml(seekurl)
        else:
            print("直接使用cookie登陆...")
            break

    # Parse the page and extract the categories and their URLs
    soups = BeautifulSoup(htmlpage, "html.parser")
    categorys = soups.findAll('ul', attrs={'class': "a-nostyle a-vertical"})
    categoryurl = []
    categoryname = ""
    pagenum = []
    filtername = []

    for item in categorys:
        for temp in item.find_all("a"):
            hrefurl = temp.get('href')
            categoryurl.append(hrefurl)

        for temp in item.find_all("span", attrs={"class": "a-color-tertiary"}):
            spantext = temp.get_text()
            pagenum.append(getpagenum(spantext))
    for i in range(0, len(categoryurl)):
        name = getfilter(categoryurl[i])
        filtername.append(name)
        categoryname = categoryname + "抓取(" + str(name) + ")请按" + str(i) + ","

    # Choose the category to crawl
    try:
        print(categoryname)
        selectcategory = int(input("请选择你要抓取类型的数字号码:"))
    except:
        print("请正确输入前面的数字!!!")
        print(categoryname)
        selectcategory = int(input("请选择你要抓取类型的数字编码:"))

    filter = filtername[selectcategory]
    mustpage = int(pagenum[selectcategory]) // 10

    try:
        print("温馨提醒:(1)后台仅仅展现1000页...(2)你要抓取的类型大约有" + str(mustpage) + "页...")
        page = int(input("请问你要抓取多少页?(默认15页):"))
        if page > 1000:
            print("后台最多只能看到1000页!!!")
            page = int(input("后台仅仅展现1000页!!!你要抓取的类型大约有" + str(mustpage) + "页!!!请问你要抓取多少页?(默认15页):"))
    except:
        page = 15

    # Containers for the crawled data
    information = []
    temparr = []

    for i in range(0, page):
        try:
            if "jp" in Loginurl:
                # e.g. https://sellercentral.amazon.co.jp/productsearch?filter=sporting&q=空気入れ&page=2
                openurl = "https://sellercentral.amazon.co.jp/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)
            else:
                # e.g. https://sellercentral.amazon.com/productsearch?filter=pets&q=dog
                openurl = "https://sellercentral.amazon.com/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)

            print("开始抓取:" + str(openurl))
            openhtml = gethtml(openurl)

            # Parse the result page and select the product divs
            soups = BeautifulSoup(openhtml, "html.parser")
            sellyours = soups.findAll('div', attrs={'class': "product"})

            if 'ap_email' in openhtml or "Amazon.com Page Not Found" in openhtml:
                print("抓取得太快!需要重新登陆...")
                openbrowser(Loginurl)
                openhtml = gethtml(openurl)

            elif not sellyours:
                print("已经翻到最后一页了...")
                break
            temparr = getinfo(openhtml, Loginurl)
        except Exception as err:
            print(err)
            print("访问抓取过程中出现小错误...")
            print("暂停20秒记录bug并尝试自我修复...")
            time.sleep(20)

        if temparr:
            information.append(temparr[0])
        loadtime = random.randint(5, 10)
        print("防止反爬虫设定暂停" + str(loadtime) + "秒...")
        time.sleep(loadtime)

    print("抓到的列表如下:")
    print(information)

    # Create the output folder for the Excel file
    filename = time.strftime('%Y%m%d', time.localtime())
    path = "../xls/" + filename
    createjia(path)

    # Write the results to an xlsx file
    timename = time.strftime('%Y%H%M%S', time.localtime())
    with xlsxwriter.Workbook(path + "/" + timename + '.xlsx') as workbook:
        worksheet = workbook.add_worksheet()

        first = ['title', 'UPC', 'EAN', 'Rank', 'Nothing', 'ASIN', 'DetailUrl', 'Star', 'Reviews']
        # Header row
        for i in range(0, len(first)):
            worksheet.write(0, i, first[i])
        # Data rows
        for m in range(0, len(information)):
            for n in range(0, len(information[m])):
                insert = str(information[m][n]).replace("UPC: ", "").replace("EAN: ", "").replace(
                    "Sales Rank:", "").replace("customer reviews", "").replace("out of 5 stars", "")
                worksheet.write(m + 1, n, insert)

    b = time.clock()
    print('运行时间:' + timetochina(b - a))
    input('请关闭窗口')  # Keep the console open so the elapsed time stays visible
Note that selenium cannot drive Chrome on its own: the Chrome browser used in this article needs a separate driver download, and it is enough to drop that driver into a directory on your PATH, such as C:\Python34:
The driver is chromedriver.exe; search for it yourself, there are plenty of copies online.
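As a quick sanity check, here is a small sketch showing both ways of wiring up the driver with the old selenium 2.x/3.x API: relying on PATH, or passing the driver location explicitly via executable_path (the path below is only an example, the same one that appears commented out in the script above).

from selenium import webdriver

# Works when chromedriver.exe sits in a directory on PATH, e.g. C:\Python34
browser = webdriver.Chrome()

# Alternatively, point selenium at the driver explicitly (selenium 2.x/3.x keyword;
# the path is just an example)
# browser = webdriver.Chrome(executable_path="C:/Python34/chromedriver.exe")

browser.get("https://www.google.com")  # quick smoke test
print(browser.title)
browser.quit()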