爬取目标
1.爬取目标网站:我的博客:https://home.cnblogs.com/u/canglongdao/followers/
2.爬取内容:爬取我的博客的所有粉丝的名称,并保存到txt
3.由于博客园登录时需要人机验证,所以无法直接使用账号自动登录
4.可以先使用selenium代码,在需要输入验证码处,停留几秒,手动验证
5.获取登录成功后的cookies,并复制其中用于保持登录状态的cookie(字典格式)
代码如下:
# coding:utf-8
# Sign in to cnblogs through a real browser and print the cookie jar before
# and after the login, so the session cookies can be copied for later reuse.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get("https://account.cnblogs.com/signin")

    # Cookies present before authentication, for comparison with the set
    # printed after the login.
    nlc = driver.get_cookies()
    print(len(nlc), nlc)

    # find_element(By.<strategy>, value) is used instead of the
    # find_element_by_* helpers, which were removed in Selenium 4.3; this
    # form is also available on Selenium 3.
    driver.find_element(By.ID, "mat-input-0").send_keys("xxx@qq.com")
    driver.find_element(By.ID, "mat-input-1").send_keys("P@ssw0rd")
    driver.find_element(By.XPATH, "//span[@class='mat-button-wrapper']").click()

    # The sign-in requires a human-verification step; pause so it can be
    # completed by hand before the cookies are read again.
    time.sleep(6)

    lc = driver.get_cookies()
    print(len(lc), lc)
finally:
    # Always release the browser and its driver process, even if a locator
    # or navigation step above raised.
    driver.quit()
运行结果:
5 [{'domain': 'account.cnblogs.com', 'expiry': 1599210767, 'httpOnly': False, 'name': '4271c12252a544478175bac9772afc3d', 'path': '/', 'secure': False, 'value': '010720fb-f7e4-4f4b-b206-0e991ecf6f5b'}, {'domain': 'account.cnblogs.com', 'httpOnly': False, 'name': 'SERVERID', 'path': '/', 'secure': False, 'value': 'daace45bf36fef87f4742d8b633fdae3|1599208967|1599208966'}, {'domain': 'account.cnblogs.com', 'httpOnly': True, 'name': '.Cnblogs.Account.Session', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF%2BQEQcGP%2Bk0zUGAelDZwKkZE07wn7bgYbw56biK9%2FwoxKcs%2FmFFb%2B21xAjYxIXXQJeai7NvLoyDfgSr45CxhE9nwRKokI1nqtUdlD5wk2MHtHUO4kIFOTpe9gzKU%2F%2BDs%2B65eSMPAU62bfOS86QdUoNXH5qL'}, {'domain': 'account.cnblogs.com', 'httpOnly': False, 'name': 'XSRF-TOKEN', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcyb192CHuZwpo_t9r1Ps07m_GVYNh15x2atqF3hGcynCnlxxqVFCWmUT5OqBV0zfYfYC3BjZ-7WUDux6AI1xLaMad3ETT6_MyakbxByaS76Nim_y5-i1_oX0aBl2U91xs'}, {'domain': 'account.cnblogs.com', 'httpOnly': True, 'name': '.Cnblogs.Account.Antiforgery', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQek9XiBoWIQkti8vvTbpqx-CFIWKb39vrCeVudMwHbcPXBWb8LBrlnlM0JzKwWlUlgaD5ioMqre_sd1nEFtrTGhAMmUsVWYxYta1gs4DkuYVinqEL6omAaSnZIJhoxLfp8'}] 5 [{'domain': '.cnblogs.com', 'expiry': 1599295374, 'httpOnly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.683535015.1599208975'}, {'domain': '.cnblogs.com', 'expiry': 1662280974, 'httpOnly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1985506889.1599208975'}, {'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.CNBlogsCookie', 'path': '/', 'secure': False, 'value': 'EB2AB3A42B8CEE723552C8644CAD13CEFA311FA3955FDB7A25A33EEB87199C843967F5791CB012543FD9AC374F535F23C228D4AC5E0373CAA6855768E5713BDF88D82BB97C38A668CDDEB72E0D5055467339189E'}, {'domain': '.cnblogs.com', 'expiry': 1599209034, 'httpOnly': False, 'name': '_gat', 'path': '/', 'secure': False, 
'value': '1'}, {'domain': '.cnblogs.com', 'expiry': 1600504974, 'httpOnly': True, 'name': '.Cnblogs.AspNetCore.Cookies', 'path': '/', 'secure': False, 'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcWJIxPwl8mHGJMd1DItl4C_m7X5ixG1-4yGpDWsiv3n9Iung2Yxk7eaHqXJ1rAGWYXeQF3OyXzSXfYHkPc7A7RPLekmvNk0dNucu8ssSF7ldaY1Nqsnx-q9O3U6JpZ_GCz8ed5jwuq1g8V_StxqpEq2ell4jFdrMmgA1GQudbiFYE1aPVcf1Rs5U7xUJ6UjMJijwG3_OAfQJ9DSibuDqYuhvaS0wwbR6OUfQIBI6NFDdwXz5GL0wJZ82wmPjkKKrrX3ADNm1jsdJxb9fceZC2CfDC2aqe-XotiNwzbsA2vhkDpB5m3JOLYA_P7mWfSexjGKs6ii9E2fNjgYgqZA8TG-1CqvApZjzkCgWklntSP71W5Xrc8zSNkRPiSuoMEKtzVecH65t9utYA2ZneK-mVParwkydH3_hcx1l03CYj6p7HP33S5MsWtvDagWN3waRPfRtdUx2KTDUTKl0Rpt-Gb1cL8RWSctfQxrg8gGKmWYwGqoPhLcDmtPc7D1C6EmZaxp61YODRup2mIzFdRdCvoU8F3Ll9Tsgb8ja7gHra03g'}]
添加登录的cookie,并获取粉丝名称
# coding:utf-8
# Reuse previously captured login cookies to open the followers page and
# collect the `title` attribute of every anchor in the avatar list.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Copied by hand from the cookies printed after a successful manual login.
c1 = {
    'domain': '.cnblogs.com',
    'expiry': 1600504974,
    'httpOnly': True,
    'name': '.CNBlogsCookie',
    'path': '/',
    'secure': False,
    'value': 'EB2AB3A42B8CEE723552C8644CAD13CEFA311FA3955FDB7A25A33EEB87199C843967F5791CB012543FD9AC374F535F23C228D4AC5E0373CAA6855768E5713BDF88D82BB97C38A668CDDEB72E0D5055467339189E',
}
c2 = {
    'domain': '.cnblogs.com',
    'expiry': 1600504974,
    'httpOnly': True,
    'name': '.Cnblogs.AspNetCore.Cookies',
    'path': '/',
    'secure': False,
    'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcWJIxPwl8mHGJMd1DItl4C_m7X5ixG1-4yGpDWsiv3n9Iung2Yxk7eaHqXJ1rAGWYXeQF3OyXzSXfYHkPc7A7RPLekmvNk0dNucu8ssSF7ldaY1Nqsnx-q9O3U6JpZ_GCz8ed5jwuq1g8V_StxqpEq2ell4jFdrMmgA1GQudbiFYE1aPVcf1Rs5U7xUJ6UjMJijwG3_OAfQJ9DSibuDqYuhvaS0wwbR6OUfQIBI6NFDdwXz5GL0wJZ82wmPjkKKrrX3ADNm1jsdJxb9fceZC2CfDC2aqe-XotiNwzbsA2vhkDpB5m3JOLYA_P7mWfSexjGKs6ii9E2fNjgYgqZA8TG-1CqvApZjzkCgWklntSP71W5Xrc8zSNkRPiSuoMEKtzVecH65t9utYA2ZneK-mVParwkydH3_hcx1l03CYj6p7HP33S5MsWtvDagWN3waRPfRtdUx2KTDUTKl0Rpt-Gb1cL8RWSctfQxrg8gGKmWYwGqoPhLcDmtPc7D1C6EmZaxp61YODRup2mIzFdRdCvoU8F3Ll9Tsgb8ja7gHra03g',
}

result = []
driver = webdriver.Chrome()
try:
    # A page on the cnblogs.com domain is loaded first, then the cookies
    # are attached to the session.
    driver.get("https://account.cnblogs.com/signin")
    driver.add_cookie(c1)
    driver.add_cookie(c2)
    time.sleep(3)

    driver.get("https://home.cnblogs.com/u/canglongdao/followers/")

    # find_elements(By.XPATH, ...) is used instead of
    # find_elements_by_xpath, which was removed in Selenium 4.3.
    links = driver.find_elements(By.XPATH, "//div[@class='avatar_list']/ul/li/a")
    for link in links:
        name = link.get_attribute("title")
        print(name)
        result.append(name)
finally:
    # Titles are already copied into plain strings, so the browser can be
    # released here whether or not the scrape succeeded.
    driver.quit()

print(result)
运行结果:
['偏爱也例外', '', '岑欢', '', 'NiuBiBoy!', '', '知识在于点滴的积累', '', '浅唱蛰伏', '', 'linofficer', '', '龙骑士大哥', '', '给明天的自己', '', '小熊软糖', '']
将粉丝名称写入.txt文档
# Append the follower names collected in `result` to a.txt.
#
# In the sample run every second entry of `result` is an empty string
# (presumably a second anchor per follower that carries no title), so only
# the even-indexed entries are kept.
names = result[::2]

# Mode "a" creates a.txt when it is missing and appends when it exists.
# The file is opened once for the whole batch instead of once per name, and
# UTF-8 is requested explicitly so non-ASCII names do not depend on the
# platform's default encoding.
with open("a.txt", "a", encoding="utf-8") as out:
    out.writelines(name + " " for name in names)
参考代码:
# coding:utf-8
# Complete reference script: attach saved login cookies to a browser
# session, read the follower names from the followers page, and append them
# to a.txt.
#
# The cookies below were obtained beforehand by signing in through the
# browser (completing the verification by hand) and printing
# driver.get_cookies(); only the two login-related entries are kept.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Copied by hand from the cookies printed after a successful manual login.
c1 = {
    'domain': '.cnblogs.com',
    'expiry': 1600504974,
    'httpOnly': True,
    'name': '.CNBlogsCookie',
    'path': '/',
    'secure': False,
    'value': 'EB2AB3A42B8CEE723552C8644CAD13CEFA311FA3955FDB7A25A33EEB87199C843967F5791CB012543FD9AC374F535F23C228D4AC5E0373CAA6855768E5713BDF88D82BB97C38A668CDDEB72E0D5055467339189E',
}
c2 = {
    'domain': '.cnblogs.com',
    'expiry': 1600504974,
    'httpOnly': True,
    'name': '.Cnblogs.AspNetCore.Cookies',
    'path': '/',
    'secure': False,
    'value': 'CfDJ8K5MrGQfPjpFvRyctF-QEQcWJIxPwl8mHGJMd1DItl4C_m7X5ixG1-4yGpDWsiv3n9Iung2Yxk7eaHqXJ1rAGWYXeQF3OyXzSXfYHkPc7A7RPLekmvNk0dNucu8ssSF7ldaY1Nqsnx-q9O3U6JpZ_GCz8ed5jwuq1g8V_StxqpEq2ell4jFdrMmgA1GQudbiFYE1aPVcf1Rs5U7xUJ6UjMJijwG3_OAfQJ9DSibuDqYuhvaS0wwbR6OUfQIBI6NFDdwXz5GL0wJZ82wmPjkKKrrX3ADNm1jsdJxb9fceZC2CfDC2aqe-XotiNwzbsA2vhkDpB5m3JOLYA_P7mWfSexjGKs6ii9E2fNjgYgqZA8TG-1CqvApZjzkCgWklntSP71W5Xrc8zSNkRPiSuoMEKtzVecH65t9utYA2ZneK-mVParwkydH3_hcx1l03CYj6p7HP33S5MsWtvDagWN3waRPfRtdUx2KTDUTKl0Rpt-Gb1cL8RWSctfQxrg8gGKmWYwGqoPhLcDmtPc7D1C6EmZaxp61YODRup2mIzFdRdCvoU8F3Ll9Tsgb8ja7gHra03g',
}

result = []
driver = webdriver.Chrome()
try:
    # A page on the cnblogs.com domain is loaded first, then the cookies
    # are attached to the session.
    driver.get("https://account.cnblogs.com/signin")
    driver.add_cookie(c1)
    driver.add_cookie(c2)
    time.sleep(3)

    driver.get("https://home.cnblogs.com/u/canglongdao/followers/")

    # find_elements(By.XPATH, ...) is used instead of
    # find_elements_by_xpath, which was removed in Selenium 4.3.
    links = driver.find_elements(By.XPATH, "//div[@class='avatar_list']/ul/li/a")
    for link in links:
        name = link.get_attribute("title")
        print(name)
        result.append(name)
finally:
    # Titles are already copied into plain strings, so the browser can be
    # released here whether or not the scrape succeeded.
    driver.quit()

print(result)

# In the sample run every second entry of `result` is an empty string
# (presumably a second anchor per follower that carries no title), so only
# the even-indexed entries are written out.
names = result[::2]

# Mode "a" creates a.txt when it is missing and appends when it exists.
# The file is opened once for the whole batch, with an explicit UTF-8
# encoding so non-ASCII names do not depend on the platform default.
with open("a.txt", "a", encoding="utf-8") as out:
    out.writelines(name + " " for name in names)
运行结果:
偏爱也例外 岑欢 NiuBiBoy! 知识在于点滴的积累 浅唱蛰伏 linofficer 龙骑士大哥 给明天的自己 小熊软糖 ['偏爱也例外', '', '岑欢', '', 'NiuBiBoy!', '', '知识在于点滴的积累', '', '浅唱蛰伏', '', 'linofficer', '', '龙骑士大哥', '', '给明天的自己', '', '小熊软糖', '']