python102-爬虫

# 昨日回顾

```python
# 1 requests+bs4爬取新闻
# 2 使用实例化得到一个对象（解析的字符串，解析器（html.parser,lxml））
# 3 遍历文档树  soup.body   查找速度快，只能找到最开始的第一个
# 4 标签对象有很多属性
    对象.name    # 标签的名字
    对象.attrs   # 标签的所有属性
    对象.get(属性名) # 取标签属性
    对象.text    # 取标签的文本内容（子子孙孙文本拼到一起）
    对象.get_text() # 等同于上面
    对象.string  # 当前标签下有文本才取出否则全是None
    对象.strings #子子孙孙的内容放到生成器中
    对象.子孙，父亲，兄弟（了解）
# 5 搜索文档树
    find # 找到第一个
    find_all # 找到所有
    5种过滤方式：字符串，列表，正则，布尔，方法
    find_all(name='字符串', id='字符串', attrs={'class':'字符串'})
    其他四种规则指的是在字符串的位置替换成其他
    limit：限制条数
    是否递归
# 6 css选择器
    -#id
    -.class
    -#id a  # 子子孙孙
    -#id>a  # 直接子节点，子
    soup.select('css选择器')  # 列表
# 7 代理池的搭建（自己搭建爬虫+web框架）
     -读一读人家的代码
# 8 打码平台
    -pytesseract
    -百度文字识别（验证码图片给它，它给你返回）
    -pillow
    -打码平台（收费）
    -使用：sdk--》类--》对象.xx方法（图片传过去）
```

今日内容

1 爬拉勾网职位信息

# Reference URL as seen in the browser:
# https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false
import requests

# Search-result page; it is requested first so the session collects its cookies.
urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
# JSON endpoint that is actually scraped.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

# Form fields posted to the JSON endpoint.
payload = dict(first='true', pn='1', kd='python')

# Browser-like headers; the Referer is the search-result page itself.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': urls,
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}

# A single session carries the cookies from the first request to the second.
s = requests.Session()
s.get(urls, headers=header, timeout=3)
cookie = s.cookies

# Post the search form together with the collected cookies and print the raw body.
reply = s.post(url, data=payload, headers=header, cookies=cookie, timeout=5)
response = reply.text
print(response)

2 爬红楼梦小说

# Table of contents: http://www.shicimingju.com/book/hongloumeng.html

import requests

from bs4 import BeautifulSoup

# Fetch the table-of-contents page of the novel.
ret=requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# print(ret.text)

soup=BeautifulSoup(ret.text,'lxml')
# Every <li> inside the "book-mulu" list links to one chapter.
li_list=soup.find(class_='book-mulu').find('ul').find_all('li')
with open('hlm.txt','w',encoding='utf-8') as f:
    for li in li_list:
        # Look the anchor up once; it provides both the title and the relative link.
        link=li.find('a')
        content=link.text
        url='https://www.shicimingju.com'+link.get('href')

        # Chapter title on its own line ('\n' was previously a broken,
        # line-split string literal that made the script a SyntaxError).
        f.write(content)
        f.write('\n')

        # Download the chapter page and append its text.
        res_content=requests.get(url)
        soup2=BeautifulSoup(res_content.text,'lxml')
        content_detail=soup2.find(class_='chapter_content').text
        f.write(content_detail)
        f.write('\n')
        print(content,'写入了')

3 爬肯德基门店

# Store-search endpoint: http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
import requests

# Browser-like User-Agent sent with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Form fields of the keyword search (first page, ten entries per page).
data = dict(cname='', pid=20, keyword='浦东', pageIndex=1, pageSize=10)

ret = requests.post(
    'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
    data=data,
    headers=header,
)
# The endpoint answers with JSON; print the decoded object.
print(ret.json())

4 爬糗事百科段子

# Page being scraped: https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup

ret=requests.get('https://www.qiushibaike.com/text/page/2/')
soup=BeautifulSoup(ret.text,'html.parser')

# Collect every element with class "article"; the text of each one is read
# from its nested element with class "content".
article_list=soup.find_all(class_='article')
for article in article_list:
    content=article.find(class_='content').text
    # Text first, then a separator line.
    print(content, '-------', sep='\n')

5 xpath选择器使用

# xpath: XPath 是一门在 XML 文档中查找信息的语言
# / :从根节点选取。
# // :不管位置，直接找
# /@属性名
# /text()
# 会复制（在浏览器开发者工具中右键元素 → Copy → Copy XPath）

# Sample HTML document that the XPath examples below are run against.
doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' aa='bb'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

# Parse the string with lxml's HTML parser; `html` is the object the
# xpath() examples below are called on.
html=etree.HTML(doc)
# html=etree.parse('search.html',etree.HTMLParser())
# 1 所有节点
# a=html.xpath('//*')

# 2 指定节点（结果为列表）
# a=html.xpath('//head')

# 3 子节点，子孙节点
# a=html.xpath('//div/a')
# a=html.xpath('//body/a') #无数据
# a=html.xpath('//body//a')


# 4 父节点
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a[1]/..')
# 也可以这样
# a=html.xpath('//body//a[1]/parent::*')



# 5 属性匹配
# a=html.xpath('//body//a[@href="image1.html"]')

# 6 文本获取(重要)  /text() 取当前标签的文本
# a=html.xpath('//body//a[@href="image1.html"]/text()')
# a=html.xpath('//body//a/text()')

# 7 属性获取  @href 取当前标签的属性
# a=html.xpath('//body//a/@href')

# # 注意从1 开始取（不是从0）
# a=html.xpath('//body//a[1]/@href')
# 8 属性多值匹配
#  a 标签有多个class类，直接匹配就不可以了，需要用contains
# a=html.xpath('//body//a[@class="li"]')
# a=html.xpath('//body//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 多属性匹配
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 按序选择
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# 取最后一个
# a=html.xpath('//a[last()]/@href')
# 位置小于3的
# a=html.xpath('//a[position()<3]/@href')
# 倒数第三个（last()-1 才是倒数第二个）
# a=html.xpath('//a[last()-2]/@href')
# 11 节点轴选择
# ancestor：祖先节点
# 使用了* 获取所有祖先节点
# a=html.xpath('//a/ancestor::*')
# # 获取祖先节点中的div
# a=html.xpath('//a/ancestor::div')
# attribute：属性值
# a=html.xpath('//a[1]/attribute::*')
# a=html.xpath('//a[1]/@aa')
# child：直接子节点
# a=html.xpath('//a[1]/child::*')
# a=html.xpath('//a[1]/child::img/@src')
# descendant：所有子孙节点
# a=html.xpath('//a[6]/descendant::*')
# a=html.xpath('//a[6]/descendant::h5/text()')
# following:当前节点之后所有节点(兄弟节点和兄弟内部的节点)
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: nodes after the current one on the same level (siblings only)
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]')
# One example has to stay active: with every assignment commented out,
# the print(a) below fails with NameError.
a=html.xpath('//a[1]/following-sibling::*[2]/@href')


print(a)


# /
# //
# /@属性名
# /text()

//以后去查找标签，bs4的find，     css，xpath（通用的）

6 selenium使用

# 为了解决requests无法直接执行JavaScript代码的问题 
# 


# pip3 install selenium


# 浏览器驱动:http://npm.taobao.org/mirrors/chromedriver/
# 驱动要跟浏览器版本对应  84.0.4147.105：驱动用84.0.4147.30/
# 下载完解压就是个exe（不同平台的可执行文件）
# from selenium import webdriver
# import time
# # bro=webdriver.Chrome()  # 得到一个谷歌浏览器对象，
# # 指定使用跟那个驱动
# bro=webdriver.Chrome(executable_path='./chromedriver.exe') # 得到一个谷歌浏览器对象，
#
# time.sleep(2)
# bro.get('https://www.baidu.com/')  # 在地址栏里输入了百度
# time.sleep(2)
# print(bro.page_source)
# time.sleep(2)
# bro.close()



# 模拟登陆百度
# from selenium import webdriver
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver.exe')
#
# bro.get('https://www.baidu.com/')
# time.sleep(0.01)
# input_k=bro.find_element_by_id('kw')
# input_k.send_keys('美女')  # 在框里写入美女
# time.sleep(2)
# sou=bro.find_element_by_id('su')  # 找到搜索按钮
# sou.click() # 点击搜索按钮
# time.sleep(4)
# bro.close()


# from selenium import webdriver
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.implicitly_wait(5)  # 隐式等待：找一个控件，如果控件没有加载出来，最多等待5秒；对所有控件生效，只需要写这一句，以后找所有控件都按这个规则来
# bro.get('https://www.baidu.com/')
#
# d_button=bro.find_element_by_link_text('登录')
#
# d_button.click()
#
# login_u=bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
# login_u.click()
#
# username=bro.find_element_by_id('TANGRAM__PSP_11__userName')
# username.send_keys('yxp654799481')
# password=bro.find_element_by_id('TANGRAM__PSP_11__password')
# password.send_keys('yxp997997')
# time.sleep(3)
# submit=bro.find_element_by_id('TANGRAM__PSP_11__submit')
#
# submit.click()
# time.sleep(10)
#
# print(bro.get_cookies())
#
# bro.close()



# ##############选择器（find系列）
# ===============所有方法===================
# 1、find_element_by_id   # 通过id查找控件
# 2、find_element_by_link_text  # 通过a标签内容找
# 3、find_element_by_partial_link_text  # 通过a标签内容找，模糊匹配
# 4、find_element_by_tag_name   # 标签名
# 5、find_element_by_class_name  # 类名
# 6、find_element_by_name      # name属性
# 7、find_element_by_css_selector  # 通过css选择器
# 8、find_element_by_xpath       # 通过xpath选择器
# 强调：

# 1、find_elements_by_xxx的形式是查找到多个元素，结果为列表




# 获取元素属性
# 重点
# tag.get_attribute('href')  # 找当前控件 的href属性对的值
# tag.text   # 获取文本内容

# 了解
# print(tag.id)   # 当前控件id号
# print(tag.location)  # 当前控件在页面位置
# print(tag.tag_name)  # 标签名
# print(tag.size)      #标签的大小



####无界面浏览器（phantomjs）
#谷歌浏览器支持不打开页面
# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver
# chrome_options = Options()
# chrome_options.add_argument('window-size=1920x3000') #指定浏览器分辨率
# chrome_options.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug
# chrome_options.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
# chrome_options.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
#
#
# chrome_options.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
#
#
# bro=webdriver.Chrome(chrome_options=chrome_options,executable_path='./chromedriver.exe')
# bro.get('https://www.baidu.com/')
# print(bro.page_source)
# bro.close()


######元素交互
# tag.send_keys()  # 往里面写内容
# tag.click()      # 点击控件
# tag.clear()      # 清空控件内容

#####执行js(有什么用?)

# from selenium import webdriver
# import time
# bro=webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.implicitly_wait(5)  # 隐式等待：找一个控件，如果控件没有加载出来，最多等待5秒；对所有控件生效，只需要写这一句，以后找所有控件都按这个规则来
# bro.get('https://www.baidu.com/')
#
#
# bro.execute_script('window.open()')
# bro.execute_script('window.open()')
# time.sleep(2)
# bro.close()


####模拟浏览器前进后退

# from selenium import webdriver
# import time
# browser=webdriver.Chrome(executable_path='./chromedriver.exe')
# browser.get('https://www.baidu.com')
# browser.get('https://www.taobao.com')
# browser.get('http://www.sina.com.cn/')
#
# browser.back()
# time.sleep(1)
# browser.forward()
#
# browser.close()


#####获取cookie
# bro.get_cookies()



#### 选项卡管理(了解)
# from selenium import webdriver
# import time
# browser=webdriver.Chrome()
# browser.get('https://www.baidu.com')
# browser.execute_script('window.open()')
#
# print(browser.window_handles) #获取所有的选项卡
# browser.switch_to_window(browser.window_handles[1])
# browser.get('https://www.taobao.com')
# time.sleep(2)
# browser.switch_to_window(browser.window_handles[0])
# browser.get('https://www.sina.com.cn')
# browser.close()



##### 异常处理
# from selenium import webdriver
# from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
# browser=webdriver.Chrome()
# try:
#
#     browser.get('')
# except Exception as e:
#     print(e)
# finally:
#     # 无论是否出异常，最终都要关掉
#     browser.close()



#####动作链（）


#### 如何把屏幕拉到最后（js控制）

# bro.execute_script('window.scrollTo(0,document.body.offsetHeight)')

7 爬取京东商品信息


from selenium import webdriver
import time
# 模拟键盘输入
from selenium.webdriver.common.keys import Keys
# Start Chrome through the chromedriver binary located in the current directory.
bro=webdriver.Chrome(executable_path='./chromedriver.exe')
# Implicit wait: element lookups wait up to 10 seconds for the element to appear.
bro.implicitly_wait(10)

def get_goods_info(bro):
    """Print the details of every product on the current and all following pages.

    For each element matching ``.gl-item`` the name, price, link, comment
    count and image URL are printed; an item for which any of these lookups
    fails is skipped.  After a page has been processed, the link whose text
    contains '下一页' is clicked and the next page is handled the same way.

    Pages are walked in a loop instead of by self-recursion, so the call
    stack does not grow by one frame per result page.

    Args:
        bro: Selenium WebDriver that already shows a search-result page.

    Raises:
        Whatever ``bro.find_element_by_partial_link_text`` raises when no
        '下一页' link can be found; that exception is what ends the paging.
    """
    # NOTE: the whitespace inside this literal is part of the printed output.
    template = '''
            商品名字：%s
            商品价格：%s
            商品地址：%s
            商品评论数：%s
            商品图片地址：%s
    
            '''
    while True:
        # li_list=bro.find_element_by_class_name('gl-warp').find_elements_by_tag_name('li')
        # goods=bro.find_elements_by_class_name('gl-item')
        goods = bro.find_elements_by_css_selector('.gl-item')
        for good in goods:
            try:
                price = good.find_element_by_css_selector('.p-price i').text
                name = good.find_element_by_css_selector('.p-name em').text
                url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
                commits = good.find_element_by_css_selector('.p-commit strong>a').text
                photo_url = good.find_element_by_css_selector('.p-img img').get_attribute('src')

                print(template % (name, price, url, commits, photo_url))
            except Exception:
                # Deliberate best effort: an item missing any of the fields is skipped.
                continue

        # Look the link up before sleeping so a missing link ends the loop at once.
        next_button = bro.find_element_by_partial_link_text('下一页')
        time.sleep(1)
        next_button.click()

try:
    bro.get('https://www.jd.com/')

    # Type the keyword into the search box (id "key") ...
    input_k=bro.find_element_by_id('key')
    input_k.send_keys('奶牛')
    # ... and submit it by simulating the Enter key.
    input_k.send_keys(Keys.ENTER)
    get_goods_info(bro)


except Exception as e:
    # Any failure during the run is only reported, not re-raised.
    print(e)

finally:
    # quit() closes every window and ends the driver session;
    # close() would only close the current window.
    bro.quit()

8 自动登录12306




import os
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
# pillow
from PIL import Image

# Chaojiying captcha-solving SDK
from chaojiying import Chaojiying_Client

# NOTE(review): these account values were hard-coded in the script. Environment
# variables now take precedence; the inline fallbacks only keep the script
# working as before and should be removed (and the accounts rotated).
CHAOJIYING_USER = os.environ.get('CHAOJIYING_USER', '306334678')
CHAOJIYING_PASSWORD = os.environ.get('CHAOJIYING_PASSWORD', 'lqz12345')
# Software ID generated in the Chaojiying user centre.
CHAOJIYING_SOFT_ID = os.environ.get('CHAOJIYING_SOFT_ID', '903641')
LOGIN_12306_USER = os.environ.get('LOGIN_12306_USER', '306334678')
LOGIN_12306_PASSWORD = os.environ.get('LOGIN_12306_PASSWORD', 'lqz12345')

bro=webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    bro.maximize_window()  # maximise the browser window
    # Switch to the account/password login tab.
    button_z=bro.find_element_by_css_selector('.login-hd-account a')
    button_z.click()
    time.sleep(2)
    # Screenshot of the whole page.
    bro.save_screenshot('./main.png')
    # Position and size of the captcha image inside the page.
    img_t=bro.find_element_by_id('J-loginImg')
    print(img_t.size)
    print(img_t.location)

    size=img_t.size
    location=img_t.location

    # (left, upper, right, lower) box of the captcha within the screenshot.
    img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
    # Cut the captcha out of the screenshot; the context manager closes the
    # screenshot file once the cropped image has been saved.
    with Image.open('./main.png') as img:
        fram = img.crop(img_tu)
        fram.save('code.png')

    # Send the captcha to Chaojiying.
    chaojiying = Chaojiying_Client(CHAOJIYING_USER, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)
    # Read the image bytes without leaking the file handle.
    with open('code.png', 'rb') as code_file:
        im = code_file.read()

    # 9004 is the captcha type code handed to the service.
    res=chaojiying.PostPic(im, 9004)
    print(res)
    result=res['pic_str']

    # 'pic_str' looks like "260,133|123,233" (or a single "x,y" pair);
    # turn it into [[260, 133], [123, 233]].  str.split('|') already yields a
    # one-element list when there is no '|', so one code path covers both cases.
    all_list = []
    for point in result.split('|'):
        parts = point.split(',')
        all_list.append([int(parts[0]), int(parts[1])])
    print(all_list)

    # Click every returned point with an action chain.
    # NOTE(review): the offsets are presumably relative to the captcha image's
    # top-left corner; newer Selenium releases measure the offset from the
    # element's centre instead - confirm against the installed version.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(img_t, x, y).click().perform()
        time.sleep(1)

    username=bro.find_element_by_id('J-userName')
    username.send_keys(LOGIN_12306_USER)
    password=bro.find_element_by_id('J-password')
    password.send_keys(LOGIN_12306_PASSWORD)
    time.sleep(3)
    submit_login=bro.find_element_by_id('J-login')
    submit_login.click()
    time.sleep(3)

    print(bro.get_cookies())
    time.sleep(10)
    bro.get('https://www.12306.cn/index/')
    time.sleep(5)

except Exception as e:
    # Any failure during the run is only reported, not re-raised.
    print(e)
finally:
    # quit() closes every window and ends the driver session;
    # close() would only close the current window.
    bro.quit()

9 cookie池讲解

# 如何搭建cookie池
# selenium写一套（一堆小号），跑起脚本，自动登录，手动参与
# 拿到cookie，放到redis中
# django搭建一个服务：127.0.0.1/get,随机返回一个cookie
# request发送请求爬数据（selenium拿到的cookie），cookie失效

10 抓包工具介绍

# 1 浏览器调试模式
# 2 fiddler，charles(自己研究一下)

拓展

https://www.cnblogs.com/liuqingzheng/articles/9079192.html
    
# 轮询和长轮询
# websocket：channels（django作者写的）

作业

-1 京东商品信息，12306破解
0 selenium爬取拉勾职位信息（爬一页）
1 爬取豆瓣top250电影：https://movie.douban.com/top250
2 爬取虎牙直播间信息：

相关阅读:
ubuntu 可以用 root 登录 ftp
正则表达式匹配中文，英文字母和数字及_的写法！同时控制长度
 HttpUtility.UrlEncode，Server.UrlEncode 的区别
 HttpUtility.UrlEncode 方法 (String) 对 URL 字符串进行编码 NET Framework 4.6 and 4.5
C# 多线程task
NET Framework 4.5 有更加简便的方法 Task.Run()
Tuple<int, int> Dictionary<string, object>妙用
 sha1加密
 MD5和sha1加密算法--散列加密技术 MD5：128bit的大整数
 3、Task.Factory属性
原文地址：https://www.cnblogs.com/zdw20191029/p/14553266.html