• Web scraping basics (a simple Python 3 introduction)



    What is a web crawler?
    - Every site publishes a robots protocol (e.g. https://www.baidu.com/robots.txt) that spells out what may and may not be crawled; see the short robots.txt sketch below.
    - Technically, anything you can see in the browser can be scraped.
    - Legally it is a grey area, so stay within the rules.
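    A minimal sketch of checking a site's robots.txt before crawling. It uses only the standard library (urllib.robotparser); the Baidu URLs are simply the example from the bullet above.

    # Check robots.txt before crawling: standard library only
    from urllib import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('https://www.baidu.com/robots.txt')
    rp.read()

    # can_fetch(user_agent, url): do the rules allow this URL for this user agent?
    print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=python'))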

    1. The requests module (for simulating HTTP requests)

    - Install: pip3 install requests. The standard library already ships urllib (and urllib2 in Python 2); the requests module is a friendlier wrapper built on top of them.
    
    # **** Basic usage ****
    # Import the module
    # import requests
    #
    # # Send a GET request and capture the response
    # res = requests.get('https://www.baidu.com')
    #
    # # The body of the response
    # print(res.text)
    #
    # with open('a.html','w',encoding='utf-8') as f:
    #     f.write(res.text)
    #
    #
    # # The HTTP status code of the response
    # print(res.status_code)
    Introduction to the requests module
    # **** Passing parameters: Chinese characters must be URL-encoded ****
    # import requests
    # from urllib.parse import urlencode
    #
    # key = input('Enter what you want to search for: ')
    # # Chinese or other special characters in the query string must be encoded first
    # key_search = urlencode({'wd': key})
    # # print(key_search)
    #
    # url = 'https://www.baidu.com/s?%s' % key_search
    #
    # # A common anti-scraping check is the User-Agent header, so carry one that looks like a browser
    # res = requests.get(url,
    #                    headers={
    #                        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    #                    }
    #
    #                    )
    #
    # with open('a.html','w',encoding='utf-8') as f:
    #     f.write(res.text)
    requests basics, part 1
    # Encoding by hand every time is tedious; let requests do it via the params argument
    # import requests
    #
    # key = input('Enter what you want to search for: ')
    #
    # # Carry a User-Agent header so the request looks like a browser
    # res = requests.get('https://www.baidu.com/s',
    #                    headers={
    #                        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    #                    },
    #                    # parameters sent GET-style (requests URL-encodes them)
    #                    params={'wd': key, 'pn': 70}
    #                    )
    #
    # print(res.text)
    #
    # with open('a.html','w',encoding='utf-8') as f:
    #     f.write(res.text)
    
    
    # Cookies
    # import requests
    #
    # Cookies = {
    #     'user_session': '2OjTrCoCXigz5gB7trCsYfMBl0jQ-abqjXdCcas9UqwVmD7y',
    # }
    #
    # response = requests.get('https://github.com/settings/emails',
    #                         cookies=Cookies)  # GitHub is not picky about request headers, so no custom User-Agent is needed; other sites may require one
    #
    # print('lich_qiu@163.com' in response.text)  # True
    requests basics, part 2
    - GET arguments
        params = dict (parameters sent in the query string)
        headers = dict, commonly:
            - User-Agent : the client type
            - Referer : the address the request came from (the previous page); used for image hot-link protection
            - Host : the target host
            - Cookie : a string
        cookies = {'user_session': 'xxx'}: cookies really belong in the request headers, but they are used so often that requests handles them as a separate argument (see the sketch below)
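    A compact sketch that puts the GET arguments above together in one call (the parameter values are only placeholders):

    import requests

    # params, headers and cookies are all plain dicts; requests URL-encodes params for you
    res = requests.get('https://www.baidu.com/s',
                       params={'wd': 'python', 'pn': 70},
                       headers={
                           'User-Agent': 'Mozilla/5.0 ...',
                           'Referer': 'https://www.baidu.com/',
                       },
                       cookies={'user_session': 'xxx'})
    print(res.url)          # the final URL with the encoded query string
    print(res.status_code)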
    
    - POST arguments
        params
        headers
        cookies
        data: the request body, sent urlencoded (form data) by default
        json: pass a dict and it is serialized with 'content-type': 'application/json'
        allow_redirects=False: whether to follow redirects; defaults to True and is rarely changed (see the sketch below)
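    A short sketch of the POST-specific arguments, using httpbin.org as a neutral echo service (the payload values are placeholders):

    import requests

    # data= sends the body as application/x-www-form-urlencoded (the default form encoding)
    r1 = requests.post('http://httpbin.org/post', data={'name': 'lich', 'age': 18})
    print(r1.json()['form'])

    # json= serializes the dict and sets content-type: application/json
    r2 = requests.post('http://httpbin.org/post', json={'name': 'lich', 'age': 18})
    print(r2.json()['json'])

    # allow_redirects=False stops requests from following 3xx responses
    r3 = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
    print(r3.status_code)   # 302, because the redirect was not followed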
    
    
    
    # Step 1: send a GET request to https://github.com/login
    # import requests
    # import re
    #
    # res_login = requests.get('https://github.com/login',
    #                          headers={
    #                              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    #                          }
    #                          )
    # # print(res_login.text)
    #
    # # The returned page contains an authenticity_token; pull it out
    # # re.S makes . match newlines, so the whole document is treated as one line
    # authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', res_login.text, re.S)[0]
    # print(authenticity_token)
    #
    # # Grab the cookies of the not-yet-logged-in session
    # login_cookie = res_login.cookies.get_dict()
    # print(login_cookie)
    #
    # # Step 2: POST the username and password to https://github.com/session
    #
    # data = {
    #     'commit': 'Sign in',
    #     'utf8': '✓',
    #     'authenticity_token': authenticity_token,
    #     'login': 'lich_qiu@163.com',
    #     'password': 'zhang319!',
    #     'webauthn-support': 'supported'
    # }
    #
    # res = requests.post(url='https://github.com/session',
    #
    #                     # the request body
    #                     data=data,
    #                     # carry the cookies of the unauthenticated session
    #                     cookies=login_cookie,
    #
    #                     )
    #
    # # On a successful login the response carries new cookies; save them and send them with later requests
    # # res.cookies.get_dict() converts the returned cookies into a dict
    # res_cookie = res.cookies.get_dict()
    # print(res_cookie)
    #
    # # Step 3: request https://github.com/settings/emails and check whether lich_qiu@163.com shows up in the response
    #
    # response = requests.get('https://github.com/settings/emails',
    #                         headers={
    #                             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    #                             'Referer': 'https://github.com/settings/profile'
    #                         },
    #                         cookies=res_cookie,
    #
    #                         )  # GitHub is not picky about request headers; other sites may require a custom User-Agent
    #
    # print('lich_qiu@163.com' in response.text)  # True
    GET and POST: introduction and basic use
    # Encoding issues
    # import requests
    # response = requests.get('http://www.autohome.com/news')
    #
    # # The encoding the page appears to use (detected from the content)
    # print(response.apparent_encoding)
    #
    # # Tell requests to decode the body as gbk
    # response.encoding = 'gbk'
    # print(response.text)
    Encoding issues
    # Scraping an image
    # Small files can simply be written out from res.content in one go

    # import requests
    # res = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1556732811646&di=2bd8396b35047f33fbcd6b023f5787b7&imgtype=0&src=http%3A%2F%2Fs15.sinaimg.cn%2Fmw690%2F0066UWNtgy6Viz3mEBoce%26690')
    #
    # with open('a.jpg','wb') as f:
    #     f.write(res.content)
    Scraping images
    # Scraping a video
    # For larger files, iterate over the body with iter_content() instead of loading it all at once

    # import requests
    #
    # res = requests.get('http://static.yximgs.com/s1/videos/www_main-059ce9beee.mp4')
    # with open('a.mp4','wb') as f:
    #     for i in res.iter_content():
    #         f.write(i)
    Scraping videos
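    For really large files it is safer to stream the download instead of holding the whole body in memory; a sketch of the same download with stream=True (the chunk size is an arbitrary choice):

    import requests

    # stream=True defers downloading the body until we iterate over it
    res = requests.get('http://static.yximgs.com/s1/videos/www_main-059ce9beee.mp4', stream=True)
    with open('a.mp4', 'wb') as f:
        for chunk in res.iter_content(chunk_size=1024 * 64):
            if chunk:
                f.write(chunk)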
    # Parsing JSON
    # import requests
    #
    # response = requests.get('http://httpbin.org/get')
    #
    # import json
    # res1 = json.loads(response.text)    # works, but clumsy
    #
    # res2 = response.json()  # get the parsed JSON directly
    #
    # print(res1 == res2)   # True, the two results are identical
    Parsing JSON
    - The response object
        print(response.text) --- the body decoded as text
        print(response.content) --- the raw body as bytes

        print(response.status_code) --- the status code
        print(response.headers) --- the response headers
        print(response.cookies) --- the cookies that came back
        print(response.cookies.get_dict()) --- the returned cookies as a dict
        print(response.cookies.items()) --- same idea as dict.items()

        print(response.url) --- the final URL, after any redirects
        print(response.history) --- the list of redirect responses that led here

        print(response.encoding) --- the encoding used to decode the body
    Response attributes
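    A small sketch of the redirect-related attributes, using httpbin.org so the redirect is predictable:

    import requests

    response = requests.get('http://httpbin.org/redirect/1')
    print(response.url)                  # the final URL, after the redirect was followed
    print(response.history)              # the intermediate 302 response(s)
    print(response.status_code)          # 200 for the final page
    print(response.cookies.get_dict())   # any cookies the server set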

    2. Advanced requests usage

    # 1 SSL cert verification
        # verify=False skips certificate validation
    # import requests
    # Carrying a client certificate:
    # response = requests.get('https://www.12306.cn',
    #                         cert=('/path/server.crt', '/path/key'))
    1 SSL cert verification
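    The verify=False variant mentioned above, as a sketch; urllib3 warns about unverified HTTPS requests, so the warning is silenced explicitly:

    import requests
    import urllib3

    urllib3.disable_warnings()   # hide the InsecureRequestWarning triggered by verify=False

    response = requests.get('https://www.12306.cn', verify=False)   # skip certificate validation
    print(response.status_code)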
    # 2 Using a proxy
    # HTTP proxy
    # import requests
    #
    # proxies = {
    #     # a dict keeps only one value per key, so use one entry per scheme;
    #     # user:password go before the @ sign when the proxy needs authentication
    #     'http': 'http://lich:123@112.85.151.216:9999',
    #     # 'http': 'http://223.241.116.173:8010',
    #     'https': 'https://localhost:8888'
    # }
    #
    # response = requests.get('https://www.12306.cn', proxies=proxies)
    # print(response.status_code)
    2 Using a proxy
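    A quick way to confirm the proxy is actually being used is to ask httpbin for the origin IP (the proxy address below is just a placeholder and will not work as-is):

    import requests

    proxies = {
        'http': 'http://223.241.116.173:8010',    # one entry per scheme; replace with a live proxy
        'https': 'http://223.241.116.173:8010',
    }

    response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(response.json())   # the 'origin' field should show the proxy's IP, not yours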
    # SOCKS proxy (requires the extra dependency: pip3 install requests[socks])
    # import requests
    # proxies = {
    #     'http': 'socks5://lich:123@112.85.151.216:9999',    # user:password go before the @ sign
    #     # 'http': 'socks5://223.241.116.173:8010',
    #     # 'https': 'socks5://localhost:8888'
    # }
    #
    # response = requests.get('https://www.12306.cn', proxies=proxies)
    #
    # print(response.status_code)
    3 SOCKS proxy
    # Timeout setting
    # import requests
    # response = requests.get('https://www.12306.cn', timeout=0.0001)
    4 Timeout setting
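    A timeout that small is almost guaranteed to fail, so in practice you catch the exception:

    import requests

    try:
        response = requests.get('https://www.12306.cn', timeout=0.0001)
    except requests.exceptions.Timeout as e:
        print('request timed out:', e)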
    # Uploading a file
    # import requests
    # files = {
    #     'file': open('a.jpg', 'rb')
    # }
    # response = requests.post('http://httpbin.org/post', files=files)
    # print(response.status_code)
    5 Uploading files
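    files= also accepts a (filename, fileobj, content_type) tuple when you want to control what the server sees; a sketch against the same httpbin endpoint:

    import requests

    files = {
        'file': ('a.jpg', open('a.jpg', 'rb'), 'image/jpeg'),   # filename, file object, MIME type
    }
    response = requests.post('http://httpbin.org/post', files=files)
    print(response.status_code)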

    3. Crawler project examples

    # Single-threaded scraping
    # import requests
    # import re
    # import os
    #
    # # Generic helper: fetch a page and return its text
    # def get_page(url):
    #     ret = requests.get(url)
    #     if ret.status_code == 200:
    #         return ret.text
    #
    #
    # def parse_res(text):
    #     # <a href="video_1549859" class="vervideo-lilink actplay">
    #     urls = re.findall(r'class="categoryem".*?href="(.*?)" ',text,re.S)
    #     print(urls)
    #     for url in urls:
    #         print(url)
    #         yield 'https://www.pearvideo.com/' + url
    #
    #
    # def parse_detail(text):
    #     # print(text)
    #     movie_url = re.findall('srcUrl="(.*?)"',text,re.S)[0]
    #     # print('actual address of the video file:', movie_url)
    #     return movie_url
    #
    # def base_dir():
    #     base = os.path.dirname(os.path.abspath(__file__))
    #     return base
    #
    #
    # def download_movie(url):
    #     import time
    #     movie_content = requests.get(url)
    #     file_name = str(time.time())+'.mp4'
    #     with open('%s/download/%s'%(base_dir(),file_name),'wb')as f:
    #         f.write(movie_content.content)
    #
    #
    #
    # if __name__ == '__main__':
    #     res = get_page('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=1')
    #     # res is the raw text of the listing page
    #     urls = parse_res(res)
    #     for url in urls:
    #         try:
    #             res_detail = get_page(url)
    #             movie_url = parse_detail(res_detail)
    #             download_movie(movie_url)
    #         except Exception as e:
    #             print(e)
    Single-threaded pearvideo scraper
    # Multi-threaded version
    import requests
    import re
    import os
    from concurrent.futures import ThreadPoolExecutor
    
    # Create a pool with 60 worker threads
    pool = ThreadPoolExecutor(60)
    
    
    # Generic helper: fetch a page and return its text
    def get_page(url):
        ret = requests.get(url)
        if ret.status_code == 200:
            return ret.text
    
    
    def parse_res(text):
        # text arrives as a Future; .result() yields the return value of get_page
        text = text.result()
        urls = re.findall(r'class="categoryem".*?href="(.*?)" ',text,re.S)
        print(urls)
        for url in urls:
            print(url)
            # yield 'https://www.pearvideo.com/' + url
            pool.submit(get_page,'https://www.pearvideo.com/'+url).add_done_callback(parse_detail)
    
    def parse_detail(text):
        # print(text)
        text = text.result()
        movie_url = re.findall('srcUrl="(.*?)"',text,re.S)[0]
        # print('actual address of the video file:', movie_url)
        pool.submit(download_movie,movie_url)
    
    def base_dir():
        base = os.path.dirname(os.path.abspath(__file__))
        base = os.path.join(base,'download')
        return base
    
    
    def download_movie(url):
        import time
        movie_content = requests.get(url)
        file_name = str(time.time())+'.mp4'
        file = os.path.join(base_dir(),file_name)
        if movie_content.status_code == 200:
            with open(file,'wb')as f:
                f.write(movie_content.content)
    
    
    if __name__ == '__main__':
        for i in range(3):
            url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=%s'%(i*12+1)
            # submit the page fetch to the pool;
            # add_done_callback() runs the given function once the task has finished
            pool.submit(get_page,url).add_done_callback(parse_res)
    Multi-threaded pearvideo scraper
    # Import the BeautifulSoup module (requests is needed too)
    import requests
    from bs4 import BeautifulSoup
    import time
    import os
    
    # https://www.autohome.com.cn/news/2/#liststart
    
    for i in range(1,10):
        url = 'https://www.autohome.com.cn/news/%s/#liststart'%i
        ret = requests.get(url)
        # print(ret.text)
    
    
        # soup = BeautifulSoup(ret.text,'lxml')
        soup = BeautifulSoup(ret.text,'html.parser')
    
        ul = soup.find(name='ul',attrs={'class':'article'})
    
        li_list = ul.find_all(name='li')
        for li in li_list:
    
            try:
                # pull out the news URL
                news_url = 'https:' + li.find(name='a').get('href')    # read an attribute
                news_title = li.find(name='h3').text        # text of the h3 tag
                news_desc = li.find(name='p').text          # the news summary
                news_img = 'https:' + li.find(name='img').get('src')   # the news image
    
                print(
                    '''
                    Title:     %s
                    Summary:   %s
                    URL:       %s
                    Image URL: %s
                    '''%(news_title,news_desc,news_url,news_img)
                )
    
                # download the news image
                response = requests.get(news_img)
                time_name = str(time.time()) + '.jpg'
                base_path = os.path.dirname(os.path.abspath(__file__))
                download_path = os.path.join(base_path,'download')
                file_name = os.path.join(download_path,time_name)
                with open(file_name,'wb')as f:
                    f.write(response.content)
    
    
            except Exception as e:
                print(e)
    Scraping Autohome news with BeautifulSoup
    # import requests
    
    # ret = requests.get('https://dig.chouti.com',
    #                    headers={
    #                        'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    #                    }
    #                    )
    #
    #
    # print(ret.status_code)
    # print(ret.text)
    
    
    # Simulated login: code 9999 means login succeeded, but upvoting still fails with only this cookie
    # ret = requests.post('https://dig.chouti.com/login',
    #                     headers={
    #                         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    #
    #                     },
    #                     data={
    #                         'phone': '8618901847206',
    #                         'password': '123.abcd',
    #                         'oneMonth': 1
    #                     }
    #                     )
    # # print(ret.text)
    # # grab the cookies returned after the login
    # cookie = ret.cookies.get_dict()
    #
    #
    # # Upvote an article: POST to https://dig.chouti.com/link/vote?linksId=25944651
    # res = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
    #
    #                     headers={
    #                         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    #                         'referer':'https://dig.chouti.com/'
    #
    #                     },
    #
    #                     cookies = cookie
    #                     )
    # print(res.text)
    Chouti auto-upvote: analysis
    # Step 1: open the Chouti front page to pick up the initial cookies
    import requests
    from bs4 import BeautifulSoup
    
    ret = requests.get('https://dig.chouti.com/',
    
                       headers={
                           'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    
                       },
    
                       )
    ret_cookie = ret.cookies.get_dict()
    
    # Step 2: simulate the login (the vote permission is tied to the cookies from step 1)
    res = requests.post('https://dig.chouti.com/login',
    
                        headers={
                            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                            'referer': 'https://dig.chouti.com/'
                        },
                        cookies=ret_cookie,
    
                        data={
                            'phone': '8618901847206',
                            'password': '123.abcd',
                            'oneMonth': 1
                        }
    
                        )
    print(res.text)
    res_cookie = res.cookies.get_dict()
    
    # # Step 3 (hard-coded version, superseded by the loop further down)
    #
    # # Upvote one article: POST to https://dig.chouti.com/link/vote?linksId=25944651
    # response = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
    #
    #                     headers={
    #                         'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    #                         'referer':'https://dig.chouti.com/'
    #
    #                     },
    #
    #                     cookies = ret_cookie
    #                     )
    # print(response.text)
    
    # Step 3: after a successful login, walk the listing pages and collect the vote URLs
    
    post_url_list = []
    for i in range(5,10):
    
        response = requests.get('https://dig.chouti.com/all/hot/recent/%s'%i,
                                headers={
                                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                    'referer': 'https://dig.chouti.com/'
                                }
                                )
    
        soup = BeautifulSoup(response.text, 'html.parser')
    
        div_all = soup.find(name='div',attrs={'class':'content-list','id':'content-list'})
        div_list = div_all.find_all(name='div',attrs={'class':'news-pic'})
    
        for div in div_list:
    
            try:
                news_id = div.find(name='img').get('lang')
                # 'https://dig.chouti.com/link/vote?linksId=%s'%news_id
                post_url = 'https://dig.chouti.com/link/vote?linksId=%s'%news_id
                post_url_list.append(post_url)
    
            except Exception as e:
                print('error while collecting the article id:',e)
    
    # print(post_url_list)
    
    
    # Step 4: loop over the collected URLs and upvote each article
    # i.e. POST to https://dig.chouti.com/link/vote?linksId=25944651 for every id
    
    for url in post_url_list:
        up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer':'https://dig.chouti.com/'
    
                            },
                            cookies = ret_cookie
                            )
        print(up_news.text)
    Chouti auto-upvote: implementation
    # Step 1: open the Chouti front page to pick up the initial cookies
    import requests
    from bs4 import BeautifulSoup
    
    ret = requests.get('https://dig.chouti.com/',
    
                       headers={
                           'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    
                       },
    
                       )
    ret_cookie = ret.cookies.get_dict()
    
    # Step 2: simulate the login
    res = requests.post('https://dig.chouti.com/login',
    
                        headers={
                            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                            'referer': 'https://dig.chouti.com/'
                        },
                        cookies=ret_cookie,
    
                        data={
                            'phone': '8618901847206',
                            'password': '123.abcd',
                            'oneMonth': 1
                        }
    
                        )
    print(res.text)
    res_cookie = res.cookies.get_dict()
    
    # Step 3: after a successful login, walk the listing pages, grab each news id and build the vote URLs
    
    post_url_list = []
    news_id_list = []
    for i in range(5, 10):
    
        response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                                headers={
                                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                    'referer': 'https://dig.chouti.com/'
                                }
                                )
    
        soup = BeautifulSoup(response.text, 'html.parser')
    
        div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
        div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    
        for div in div_list:
    
            try:
                news_id = div.find(name='img').get('lang')
                news_id_list.append(news_id)
                # 'https://dig.chouti.com/link/vote?linksId=%s'%news_id
                post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
                post_url_list.append(post_url)
    
            except Exception as e:
                print('error while collecting the article id:', e)
    
    # Step 4: loop over the ids and cancel the upvotes
    # To cancel a vote, POST to https://dig.chouti.com/vote/cancel/vote.do with form data {linksId: 25933276}
    
    
    url = 'https://dig.chouti.com/vote/cancel/vote.do'
    for news_id in news_id_list:
        up_news = requests.post(url,
                                headers={
                                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                    'referer': 'https://dig.chouti.com/'
    
                                },
                                cookies=ret_cookie,
                                data={
                                    'linksId': news_id
                                }
                                )
        print(up_news.text)
    Chouti auto-cancel-upvote: implementation

    4. The bs4 (BeautifulSoup) module

    # html_doc = """
    # <html><head><title>The Dormouse's story</title></head>
    # <body>
    # <p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>
    #
    # <p class="story">Once upon a time there were three little sisters; and their names were
    # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    # and they lived at the bottom of a well.</p>
    #
    # <p class="story">...</p>
    # """
    
    # 1. Basic usage
    # from bs4 import BeautifulSoup
    # soup=BeautifulSoup(html_doc,'lxml')
    # # soup=BeautifulSoup(open('a.html'),'lxml')
    #
    # print(soup.p) # if several identical tags exist, only the first is returned
    # print(soup.p.b.text)    # The Dormouse's story
    # print(soup.p.b.get('class'))    # ['boldest']
    # print(soup.a) # if several identical tags exist, only the first is returned
    #
    # # 2. Tag name
    # print(soup.p.name)
    #
    # # 3. Tag attributes
    # print(soup.p.attrs)
    #
    # # 4. Tag contents
    # print(soup.p.string) # returns the text only when p has a single text child, otherwise None
    # print(soup.p.strings) # a generator yielding all the text under p
    # print(soup.p.text) # all the text under p
    # for line in soup.stripped_strings: # all text with the whitespace stripped
    #     print(line)
    #
    #
    # '''
    # If a tag has several children, .string cannot decide which child's text to return, so it is None;
    # with exactly one child it returns that child's text. For a structure like the one below,
    # soup.p.string is None, but soup.p.strings still finds all of the text.
    # <p id='list-1'>
    #     哈哈哈哈
    #     <a class='sss'>
    #         <span>
    #             <h1>aaaa</h1>
    #         </span>
    #     </a>
    #     <b>bbbbb</b>
    # </p>
    # '''
    #
    # # 5. Nested selection
    # print(soup.head.title.string)
    # print(soup.body.a.string)
    #
    #
    # # 6. Children and descendants
    # print(soup.p.contents) # all direct children of p
    # print(soup.p.children) # an iterator over all direct children of p
    #
    # for i,child in enumerate(soup.p.children):
    #     print(i,child)
    #
    # print(soup.p.descendants) # all descendants: every tag nested anywhere under p
    # for i,child in enumerate(soup.p.descendants):
    #     print(i,child)
    #
    # # 7. Parents and ancestors
    # print(soup.a.parent) # the parent of the a tag
    # print(soup.a.parents) # all ancestors of the a tag: its parent, the parent's parent, and so on
    #
    #
    # # 8. Siblings
    # print('=====>')
    # print(soup.a.next_sibling) # the next sibling
    # print(soup.a.previous_sibling) # the previous sibling
    #
    # print(list(soup.a.next_siblings)) # the following siblings, as a generator
    # print(soup.a.previous_siblings) # the preceding siblings, as a generator
    1 Traversing the document tree
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    
    <p class="story">...</p>
    """
    
    # Five kinds of filters: string, regular expression, list, True, and a function
    # soup.find()
    # find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
    # name: tag name, attrs: attributes, text: text content, recursive=False disables recursive search (default is True), **kwargs
    
    from bs4 import BeautifulSoup
    
    # 1 Exact string match
    # soup = BeautifulSoup(html_doc,'lxml')
    # # ret = soup.find(name='body')  # filter by tag name
    # # ret = soup.find(attrs={'class':'title'})   # filter by attribute
    # ret = soup.find(text="The Dormouse's story")    # filter by text
    # print(ret)
    # print(type(ret))
    
    
    # 2 Regular expression match
    # import re
    # soup = BeautifulSoup(html_doc,'lxml')
    # # ret = soup.find(name=re.compile('^p'))
    # # ret = soup.find(attrs={'class':re.compile('^s')})
    # ret = soup.find(name='a',text=re.compile('^L'))
    # print(ret)
    
    
    # 3 List match
    soup = BeautifulSoup(html_doc,'lxml')
    # ret = soup.find_all(name=['a','b'])
    # ret = soup.find_all(attrs={'class':['title','sister']})
    ret = soup.find_all(text=['Elsie','Lacie'])
    print(ret)
    2 Searching the document tree
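    The remaining two filters from the list above (True and a function), sketched against the same html_doc:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_doc, 'lxml')

    # True matches every tag
    print([tag.name for tag in soup.find_all(True)])

    # a function filter receives each tag and should return True or False
    def has_class_but_no_id(tag):
        return tag.has_attr('class') and not tag.has_attr('id')

    print(soup.find_all(has_class_but_no_id))   # tags that have class= but no id=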

    5. The selenium module

    # The most basic usage
    # from selenium import webdriver
    # import time
    #
    #
    # # webdriver.Chrome() returns an object that behaves like your browser
    # browser = webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # print(browser.page_source)
    #
    # time.sleep(2)
    #
    # # always close the browser when you are done
    # browser.close()
    
    # imports used by the examples below
    # from selenium import webdriver
    # import time
    Basic usage
    #### All the element selectors
        # 1. find_element_by_id                  find by id
        # 2. find_element_by_link_text           find by exact link text
        # 3. find_element_by_partial_link_text   find by partial link text
        # 4. find_element_by_tag_name            find by tag name
        # 5. find_element_by_class_name          find by class name
        # 6. find_element_by_name                find by the name attribute
        # 7. find_element_by_css_selector        find by CSS selector
        # 8. find_element_by_xpath               find by XPath
    #### All the element selectors
    Selector overview
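    A short sketch exercising a few of these selectors (this is the selenium 3 style API used throughout these notes; the '新闻' link text is only a guess at a link present on Baidu's home page):

    from selenium import webdriver
    import time

    browser = webdriver.Chrome()
    try:
        browser.implicitly_wait(3)                                      # wait up to 3s for elements
        browser.get('https://www.baidu.com')

        print(browser.find_element_by_id('kw'))                         # by id
        print(browser.find_element_by_css_selector('#kw'))              # by CSS selector
        print(browser.find_element_by_xpath('//input[@id="kw"]'))       # by XPath
        print(browser.find_element_by_link_text('新闻'))                 # by exact link text
        time.sleep(2)
    finally:
        browser.close()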
    # Example 1: open Baidu and type a keyword into the search box
    # try:
    #     browser = webdriver.Chrome()
    #     browser.get('https://www.baidu.com')
    #
    #     time.sleep(2)
    #
    #     search_input = browser.find_element_by_id('kw')
    #     key = input('Enter what you want to search for: ')
    #     search_input.send_keys(key)
    #     time.sleep(5)
    #
    #
    # except Exception as e:
    #     print(e)
    # finally:
    #     browser.close()
    Simple use, part 1
    # Example 2: open Baidu and log in
    # try:
    #     browser = webdriver.Chrome()
    #     # implicit wait: if an element cannot be found yet, keep retrying for up to 3 seconds
    #     browser.implicitly_wait(3)
    #     browser.get('https://www.baidu.com')
    #
    #     time.sleep(2)
    #
    #     login_btn = browser.find_element_by_link_text('登录')
    #     login_btn.click()
    #     user_login = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    #     user_login.click()
    #     username_input = browser.find_element_by_id('TANGRAM__PSP_10__userName')
    #     username_input.send_keys('13681878977')
    #     password_input = browser.find_element_by_id('TANGRAM__PSP_10__password')
    #     password_input.send_keys('zhang319!')
    #     submit_btn = browser.find_element_by_id('TANGRAM__PSP_10__submit')
    #     submit_btn.click()
    #     time.sleep(5)
    #
    #     search_input = browser.find_element_by_id('kw')
    #     search_input.send_keys('名侦探柯南')
    #     time.sleep(10)
    #
    #
    # except Exception as e:
    #     print(e)
    # finally:
    #     browser.close()
    Simple use, part 2
    # Example 3: scraping JD product listings
    
    # from selenium import webdriver
    # from selenium.webdriver.common.keys import Keys
    # import time
    #
    #
    # def get_goods(browser):
    #     li_list = browser.find_elements_by_class_name('gl-item')
    #     for li in li_list:
    #         goods_price = li.find_element_by_css_selector('.p-price i').text
    #         # print(goods_price)
    #
    #         goods_comment = li.find_element_by_css_selector('.p-commit strong a').text
    #         # print(goods_comment)
    #
    #         goods_name = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('title')
    #         # print(goods_name)
    #
    #         goods_url = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('href')
    #
    #         goods_img = li.find_element_by_css_selector('.p-img a img').get_attribute('src')
    #         if not goods_img:
    #             goods_img = 'https:'+li.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
    #
    #         print(
    #             '''
    #             Product name:      %s
    #             Price:             %s
    #             Number of reviews: %s
    #             Detail page:       %s
    #             Image URL:         %s
    #             '''% (goods_name, goods_price, goods_comment, goods_url, goods_img)
    #         )
    #
    #     # move on to the next result page and recurse
    #     next_page = browser.find_element_by_partial_link_text('下一页')
    #     time.sleep(2)
    #     next_page.click()
    #     get_goods(browser)
    #
    #
    # def spider():
    #     try:
    #         browser = webdriver.Chrome()
    #         browser.implicitly_wait(3)
    #
    #         browser.get('https://www.jd.com')
    #
    #         search_input = browser.find_element_by_id('key')
    #         search_input.send_keys('手机')
    #         search_input.send_keys(Keys.ENTER)
    #         time.sleep(5)
    #
    #
    #         # pull the product information out of the result page
    #         get_goods(browser)
    #
    #
    #     except Exception as e:
    #         print(e)
    #     finally:
    #         browser.close()
    #
    # if __name__ == '__main__':
    #     spider()
    Simple use, part 3: scraping JD product listings
    # Example 4: simulate the browser's back and forward buttons
    # import time
    # from selenium import webdriver
    #
    # browser=webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # browser.get('https://www.taobao.com')
    # browser.get('http://www.sina.com.cn/')
    #
    # browser.back()
    # time.sleep(10)
    # browser.forward()
    # browser.close()
    
    
    # Executing JavaScript
    # from selenium import webdriver
    # import time
    #
    # browser=webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # browser.execute_script('alert(1234)')
    # # while the alert is open, .close() will not take effect
    # browser.close()
    
    
    
    # Tab management: tabs can be opened with JavaScript (window.open) or keyboard shortcuts (Ctrl+T, etc.);
    # the JavaScript way is the most portable
    # import time
    # from selenium import webdriver
    #
    # browser=webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # browser.execute_script('window.open()')
    #
    # print(browser.window_handles) # all open tabs
    # browser.switch_to_window(browser.window_handles[1])
    # browser.get('https://www.taobao.com')
    # time.sleep(5)
    # browser.switch_to_window(browser.window_handles[0])
    # browser.get('https://www.sina.com.cn')
    # browser.close()
    
    
    
    
    # Controlling mouse movement (drag and drop)
    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    # from selenium.webdriver.common.by import By  # how to locate elements: By.ID, By.CSS_SELECTOR, ...
    # from selenium.webdriver.common.keys import Keys  # keyboard keys
    # from selenium.webdriver.support import expected_conditions as EC
    # from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
    # import time
    #
    # driver = webdriver.Chrome()
    # driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # wait=WebDriverWait(driver,3)
    # # driver.implicitly_wait(3)  # or use an implicit wait
    #
    # try:
    #     driver.switch_to.frame('iframeResult')  # switch into the iframeResult frame
    #     sourse=driver.find_element_by_id('draggable')
    #     target=driver.find_element_by_id('droppable')
    #
    #     # Approach 1: queue the actions on one action chain and run them serially
    #     # actions=ActionChains(driver) # get an action-chain object
    #     # actions.drag_and_drop(sourse,target) # queue the drag-and-drop
    #     # actions.perform()
    #
    #     # Approach 2: a fresh action chain per step, moving a small offset each time
    #     ActionChains(driver).click_and_hold(sourse).perform()
    #     distance=target.location['x']-sourse.location['x']
    #
    #     track=0
    #     while track < distance:
    #         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
    #         track+=2
    #
    #     ActionChains(driver).release().perform()
    #
    #     time.sleep(10)
    #
    # finally:
    #     driver.close()
    Simple use, part 4: simulating browser actions

    6. A WeChat auto-reply bot

    # Pie chart of friends by gender
    # from wxpy import *
    # from pyecharts import Pie
    # # import webbrowser
    # bot = Bot(cache_path=True)  # confirm the login on your phone
    #
    # # get all friend objects as a list
    # friends = bot.friends()
    #
    # attr = ['male friends', 'female friends', 'unknown']
    # value = [0,0,0]
    #
    # for friend in friends:
    #     if friend.sex == 1: # 1 means male
    #         value[0] +=1
    #     elif friend.sex == 2:   # 2 means female
    #         value[1] += 1
    #     else:
    #         value[2] += 1
    #
    # pie = Pie('Gender ratio of my friends')
    # pie.add("",attr, value, is_label_show=True)
    #
    # # chart title (str), attribute names (list), values (list); is_label_show toggles the labels
    # pie.render('sex.html')
    #
    # # open the result in a browser
    # from selenium import webdriver
    # import time
    # browser = webdriver.Chrome()
    # browser.get('/Users/lich/PycharmProjects/w3spider_Proj/sex.html')
    # time.sleep(10)
    # browser.close()
    Friend statistics pie chart
    # Map of friends by province
    # from wxpy import *
    # from pyecharts import Map
    # from pyecharts import Pie
    # import webbrowser
    # bot = Bot(cache_path=True)  # confirm the login on your phone
    #
    # # get all friend objects as a list
    # friends = bot.friends()
    #
    # area_dic = {}       # counts per province
    #
    # for friend in friends:
    #     if friend.province not in area_dic:
    #         area_dic[friend.province] = 1
    #     else:
    #         area_dic[friend.province] += 1
    # attr = area_dic.keys()
    # value = area_dic.values()
    #
    # map = Map('Where my friends live', width= 1200,height=600)
    # map.add(
    #     "friend distribution",
    #     attr,
    #     value,
    #     maptype='china',
    #     is_visualmap = True, # use the VisualMap component
    #
    # )
    #
    # # is_visualmap -> bool: whether to use the visual-map component
    # map.render('area.html')
    Friend distribution map by province
    # Auto-reply to every friend:
    # from wxpy import *
    # bot=Bot(cache_path=True)
    #
    # @bot.register()
    # def recv_send_msg(recv_msg):
    #     print('received:', recv_msg.text) # recv_msg.text is the message text
    #     return 'auto reply: %s' % recv_msg.text
    #
    # # drop into a Python shell so the program keeps running
    # embed()
    Auto-reply to all friends
    # Auto-reply only to my wife
    # from wxpy import *
    # bot=Bot(cache_path=True)
    #
    # girl_friend=bot.search('老婆')[0]
    # print(girl_friend)
    #
    # @bot.register() # handle messages from the chosen friend: recv_msg.sender is compared against girl_friend
    # def recv_send_msg(recv_msg):
    #     # print('received:', recv_msg.text) # recv_msg.text is the message text
    #     if recv_msg.sender == girl_friend:
    #         recv_msg.forward(bot.file_helper, prefix='Message from my wife: ') # keep a copy in the file helper to read later
    #         ms = 'My wife is the most beautiful; my love for her flows like a mighty river, never ending.'
    #         print('>>> replied to my wife:', ms)
    #         return ms  # send the reply
    #
    # embed()
    Auto-reply to my wife, part 1
    # Auto-reply using the Turing (tuling123) chatbot API
    
    import json
    import requests
    from wxpy import *
    bot = Bot(cache_path=True)
    
    # Call the Turing robot API: send the incoming text and return the robot's reply
    def auto_reply(text):
        url = "http://www.tuling123.com/openapi/api"
        api_key = "9df516a74fc443769b233b01e8536a42"
        payload = {
            "key": api_key,
            "info": text,
        }
        r = requests.post(url, data=json.dumps(payload))
        result = json.loads(r.content)
        # return "[from the chatbot] " + result["text"]
        return result["text"]
    
    
    girl_friend = bot.search('老婆')[0]
    
    @bot.register()
    def forward_message(msg):
        if msg.sender == girl_friend:
            return auto_reply(msg.text)
    
    
    embed()
    Auto-reply to my wife, part 2 (using the Turing chatbot)