Web Scraping Practice


    Small scraping projects

    0. Scraping university rankings

    import bs4
    import requests
    from bs4 import BeautifulSoup
    
    
    # A function that fetches the page text for a given URL
    def getHTMLText(url):
        # Guard against errors while fetching the page
        try:
            # Request the URL with a 30-second timeout
            r = requests.get(url, timeout=30)
            # Raise an exception for a bad status code
            r.raise_for_status()
            # Set the text encoding from the apparent encoding
            r.encoding = r.apparent_encoding
            # Return the page text on success
            return r.text
        except:
            # Return an empty string if anything went wrong
            return ""
    
    
    # A function that fills the given (empty) list with university info parsed from the page text
    def fillUnivList(ulist, html):
        # Parse the page text with BeautifulSoup using the 'html.parser' parser
        soup = BeautifulSoup(html, "html.parser")
        # The page source shows everything we need lives inside the tbody tag, so walk the children of 'tbody'
        for tr in soup.find('tbody').children:
            # Keep only real tr tags (skip whitespace nodes) via bs4.element.Tag
            if isinstance(tr, bs4.element.Tag):
                # Grab the td tags inside this tr
                tds = tr('td')
                # [<td>1</td>, <td><div align="left">清华大学</div></td>, <td>北京</td>, <td>95.3</td>...
                # Pull the text we need out of the td tags and append it to ulist as a list
                ulist.append([tds[0].string, tds[1].string,
                              tds[2].string, tds[3].string])
    
    
    # A function that prints the university list for one province
    def printUnivList(ulist, province):
        # Print the title
        print("中国最好大学排名2018({}地区)".format(province).center(45, '-'))
        # A format template for each row.
        # Note: slot {4} supplies the fill character. English and Chinese characters have different
        # display widths and Python pads with ASCII spaces by default, so we pass chr(12288)
        # (the full-width space) to pad with Chinese-width spaces and keep the columns aligned.
        tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
        # Print the header row
        print(tplt.format("排名", "学校名称", "地区", "总分", chr(12288)))
        if province == '安徽':
            print(tplt.format(1, '安徽师范大学花津校区', '安徽', 99.9, chr(12288)))
        # Loop over every university (limit range(len(ulist)) to control how many schools are printed)
        for i in range(len(ulist)):
            # Each entry is itself a list with one university's info
            u = ulist[i]
            # u[2] is the region; only print schools from the requested province
            # (change the condition, or drop it to print every school)
            if u[2] == province:
                print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))
    
    
    # The driver function
    def main(province='安徽'):
        # An empty list that will hold the university info
        uinfo = []
        # The page to scrape
        url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
        # Fetch the page text
        html = getHTMLText(url)
        # Fill the university list
        fillUnivList(uinfo, html)
        # Print the university list
        printUnivList(uinfo, province=province)
    
    
    main()
    # main(province='北京')
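
    The {1:{4}^10} slot in the template reads its fill character from format() argument 4. A minimal standalone sketch (not part of the scraper) of why chr(12288), the full-width space, keeps mixed Chinese/ASCII columns aligned:

    # Minimal sketch: pad with an ASCII space vs. the full-width space chr(12288)
    tplt = "{0:^10}\t{1:{2}^10}"
    print(tplt.format("排名", "学校名称", ' '))          # ASCII-space padding: Chinese columns drift
    print(tplt.format("排名", "学校名称", chr(12288)))   # full-width padding: columns line up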

    1. Scraping the Douban Top 250

    import requests
    import time
    from openpyxl import Workbook
    from bs4 import BeautifulSoup
    
    wb = Workbook()
    sheet = wb.active
    count = 1
    # Douban shows 25 movies per page; start=0/25/50/75 covers the first 100
    # (use range(0, 250, 25) to fetch the full Top 250)
    for i in range(0,100,25):
        ret = requests.get('https://movie.douban.com/top250?start=%s&filter='%(i))
        bs = BeautifulSoup(ret.text,'html.parser')
        # Each movie on the page is an <li> inside <ol class="grid_view">
        ol = bs.find(name='ol',attrs={'class':'grid_view'})
        li_list = ol.find_all(name='li')
        # Sheet title and header row
        sheet.title = '好评电影'
        sheet['A1'].value = '序号'
        sheet['B1'].value = '电影名称'
        sheet['C1'].value = '电影评分'
        sheet['D1'].value = '电影链接'
        sheet['E1'].value = '电影图片'
        for li in li_list:
            # Title, detail link, rating and poster for each movie
            name = li.find(name='span',attrs={'class':'title'})
            a = li.find(name='a')
            span = li.find(name='span',attrs={'class':'rating_num'})
            img = a.find(name='img')
            # Row 1 is the header, so data starts at row 2
            count += 1
            sheet['A%s'%(count)].value = count - 1
            sheet['B%s'%(count)].value = name.text
            sheet['C%s'%(count)].value = span.text
            sheet['D%s'%(count)].value = a['href']
            sheet['E%s'%(count)].value = img['src']
        # Pause one second between pages to be polite to the server
        time.sleep(1)
    wb.save('好评电影.xlsx')
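
    Douban may reject requests that arrive without a browser-like User-Agent; a minimal sketch of sending one with each page request (the header value is just an example):

    # Minimal sketch: pass a User-Agent header when requesting a Top 250 page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0 Safari/537.36'
    }
    ret = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
    print(ret.status_code)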

    2. Scraping Autohome news

    import requests
    from bs4 import BeautifulSoup
    from openpyxl import Workbook
    
    
    def run(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        response = requests.get(url,headers=headers)
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text,'html.parser')
        # Find the ul that holds the news items
        ul = soup.find(name='ul',attrs={"class":"article"})
        # Get all of its li items
        li_list = ul.find_all(name='li')
        infos = []
        for li in li_list:
            name = li.find(name="h3")
            name1 = ""
            if name:
                name1 = (name.text)
            href = li.find(name='a')
            href1 = ""
            if href:
                href1 = ('http:'+href['href'])
            info = li.find(name='p')
            info1 = ""
            if info:
                info1 = (info.text)
            infos.append({"title":name1,"href":href1,"info":info1})
        print(infos)
    
    if __name__ == '__main__':
        url = 'https://www.autohome.com.cn/news/'
        run(url)
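
    The openpyxl Workbook imported above is never used. A minimal sketch of persisting the scraped items with it, assuming run() is changed to return infos instead of just printing (the file name is only an example):

    # Minimal sketch: write the list of dicts returned by run() to an xlsx file
    def save_infos(infos, filename='autohome_news.xlsx'):
        wb = Workbook()
        sheet = wb.active
        sheet.append(['title', 'href', 'info'])  # header row
        for item in infos:
            sheet.append([item['title'], item['href'], item['info']])
        wb.save(filename)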

    3. Scraping Doutula meme images

    import requests
    from bs4 import BeautifulSoup
    ret = requests.get('https://www.doutula.com/photo/list?page=0')
    bs = BeautifulSoup(ret.text,'html.parser')
    div = bs.find(name='div',attrs={'class':'page-content text-center'})
    
    a_list = div.find_all(name='a')
    for a in a_list:
        img = a.find(name='img')
        img_name = img.get('alt')
        img_url = img.get('data-backup')
    
        if img_name and img_url:
            # print(img_name)
            # print(img_url)
            # Download the image and save it into the 表情包 directory (which must already exist)
            ret_img = requests.get(img_url)
            with open('表情包/%s.jpg'%(img_name),'wb') as f:
                f.write(ret_img.content)
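
    The download loop assumes a 表情包 directory already exists; a minimal sketch of creating it up front (place this before the loop):

    import os
    # Create the output directory if it is missing; exist_ok avoids an error when it already exists
    os.makedirs('表情包', exist_ok=True)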

    4. Scraping Pear Video (梨视频)

    import requests
    import re
    from bs4 import BeautifulSoup
    
    ret = requests.get('https://www.pearvideo.com/')
    # print(ret.text)  # debug: dump the homepage HTML
    
    bs = BeautifulSoup(ret.text,'html.parser')
    # Each recommended video sits in a <div class="vervideo-tbd">
    div_list = bs.find_all(name='div',attrs={'class':'vervideo-tbd'})
    
    num = 0
    for div in div_list:
        # Build the video page URL from the relative href
        a = div.find(name='a')
        video_url = 'https://www.pearvideo.com/' + a.get('href')
        video_ret = requests.get(video_url)
    
        # Pull the mp4 address out of the video page source ([^\s"] stops at whitespace or quotes)
        match = re.search(r'https://[^\s"]+?\.mp4', video_ret.text)
        if not match:
            continue
        mp4_url = match.group()
        print(mp4_url)
        mp4_ret = requests.get(mp4_url)
        with open('梨视频%s.mp4'%(num),'wb') as f:
            f.write(mp4_ret.content)
        num += 1
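
    Video files can be large; a minimal sketch of streaming the download to disk in chunks instead of holding the whole response in memory (reusing mp4_url and num from the loop above):

    # Minimal sketch: stream the mp4 to disk in 1 MB chunks
    mp4_ret = requests.get(mp4_url, stream=True)
    with open('梨视频%s.mp4'%(num),'wb') as f:
        for chunk in mp4_ret.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)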

    Implementing an online translation feature

    import requests
    import json
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    
    
    def main(keys=''):
        url = 'http://fy.iciba.com/ajax.php?a=fy'
        # Form data: 'f'/'t' are the source/target languages ('auto' lets the service detect them), 'w' is the text to translate
        data = {
            'f': 'auto',
            't': 'auto',
            'w': keys
        }
        response = requests.post(url,headers=headers,data=data)
        info = response.text
        data_list = json.loads(info)
        try:
            val = data_list['content']['word_mean']  # Chinese -> English
        except:
            val = data_list['content']['out']  # English -> Chinese
        return val
    
    if __name__ == '__main__':
        keys = input('请输入需要翻译的英文或者中文...')
        if not keys:
            print('请您正确输入需要翻译的中文或者英文...')
        else:
            data = main(keys)
            print(data)
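
    The try/except above assumes the 'content' key is always present in the response; a minimal sketch of a more defensive lookup with dict.get (falls back to an empty string if the service returns an unexpected shape):

    # Minimal sketch: defensive parsing of the translation response
    content = data_list.get('content', {})
    val = content.get('word_mean') or content.get('out') or ''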

    Small selenium projects

    Appetizer

    # Search Baidu for 老男孩
    from selenium import webdriver
    # Open a browser
    b = webdriver.Chrome()
    # Load Baidu
    b.get('https://www.baidu.com')
    # Locate Baidu's search input box by its id: kw
    ele = b.find_element_by_id('kw')
    # Clear the input box
    ele.clear()
    # Type 老男孩
    ele.send_keys('老男孩')
    # Locate the search button node (id: su)
    su = b.find_element_by_id('su')
    # Click the button
    su.click()
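
    Newer Selenium releases drop the find_element_by_* helpers; a minimal sketch of the same lookups with By locators (Selenium 4 style):

    # Minimal sketch: locate the same elements with By.ID
    from selenium.webdriver.common.by import By
    ele = b.find_element(By.ID, 'kw')
    su = b.find_element(By.ID, 'su')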

    Scraping JD.com

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    import time
    
    def get_goods(driver):
        try:
            goods = driver.find_elements_by_class_name('gl-item')
    
            for good in goods:
                detail_url = good.find_element_by_tag_name('a').get_attribute('href')
    
                p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
                price = good.find_element_by_css_selector('.p-price i').text
                p_commit = good.find_element_by_css_selector('.p-commit a').text
                msg = '''
                商品 : %s
                链接 : %s
                价钱 :%s
                评论 :%s
                ''' % (p_name, detail_url, price, p_commit)
    
                print(msg, end='\n\n')
    
            button = driver.find_element_by_partial_link_text('下一页')
            button.click()
            time.sleep(1)
            get_goods(driver)
        except Exception:
            # Stop when the 下一页 button can no longer be found (last page) or anything else fails
            pass
    
    
    def spider(url, keyword):
        driver = webdriver.Chrome()
        driver.get(url)
        driver.implicitly_wait(3)  # implicit wait: poll up to 3 seconds when locating elements
        try:
            input_tag = driver.find_element_by_id('key')
            input_tag.send_keys(keyword)
            input_tag.send_keys(Keys.ENTER)
            get_goods(driver)
        finally:
            driver.close()
    
    
    if __name__ == '__main__':
        spider('https://www.jd.com/', keyword='华为P30')
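
    get_goods calls itself once per results page, so a very long search could eventually hit Python's recursion limit; a minimal sketch of the same pagination written as a loop, reusing the locators above (max_pages is an illustrative parameter):

    # Minimal sketch: walk the result pages with a loop instead of recursion
    def get_goods_loop(driver, max_pages=10):
        for _ in range(max_pages):
            for good in driver.find_elements_by_class_name('gl-item'):
                print(good.find_element_by_css_selector('.p-name em').text)
            try:
                driver.find_element_by_partial_link_text('下一页').click()
            except Exception:
                break  # no next-page link, so this was the last page
            time.sleep(1)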

    Scraping and data analysis: a '雨女无瓜' danmaku word cloud

    import requests
    from bs4 import BeautifulSoup
    import datetime
    import pandas as pd
    import matplotlib.pyplot as plt
    import re
    import jieba
    import numpy as np
    from PIL import Image  # Pillow replaces scipy.misc.imread, which was removed from newer SciPy releases
    from wordcloud import WordCloud, ImageColorGenerator
    
    url = "https://comment.bilibili.com/92542241.xml"
    r = requests.get(url)
    r.encoding = 'utf8'
    
    
    soup = BeautifulSoup(r.text,'lxml')
    d = soup.find_all('d')
    
    dlst = []
    n = 0
    for i in d:
        n += 1
        danmuku = {}
        danmuku['弹幕'] = i.text
        danmuku['网址'] = url
        danmuku['时间'] = datetime.date.today()
        dlst.append(danmuku)
    
    df = pd.DataFrame(dlst)
    
    with open('sign.txt','w',encoding='utf8') as f:
        for text in df['弹幕'].values:
            # Keep only Chinese characters ([一-龥] is the CJK range \u4e00-\u9fa5)
            pattern = re.compile(r'[一-龥]+')
            filter_data = re.findall(pattern,text)
            f.write("".join(filter_data))
    
    with open('sign.txt', 'r', encoding='utf8') as f:
        data = f.read()
        # Segment the text into words with jieba and put the segments into a DataFrame
        segment = jieba.lcut(data)
        words_df = pd.DataFrame({"segment": segment})
    
    # Count how many times each segment appears (dict-style agg was removed from newer pandas, so use size())
    words_stat = words_df.groupby('segment').size().reset_index(name='计数')
    words_stat = words_stat.sort_values(by=['计数'],ascending=False)
    
    color_mask = np.array(Image.open('01.jpg'))
    
    wordcloud = WordCloud(
        # font_path="simhei.ttf",   # this font is not available on macOS
        font_path=r"C:\Windows\Fonts\simkai.ttf",
        # a font that can render Chinese characters
        background_color="white",  # background colour
        max_words=3000,  # maximum number of words shown in the cloud
        mask=color_mask,  # background (mask) image
        max_font_size=200,  # maximum font size
        random_state=100,
        width=1000, height=860, margin=2,
        # default image size; when a mask image is set, the saved image follows the
        # mask's size instead; margin is the spacing kept around each word
    )
    
    # Build the word cloud: generate() accepts raw text, or we can compute the frequencies ourselves and call generate_from_frequencies()
    word_frequence = {x[0]: x[1] for x in words_stat.head(500).values}
    word_frequence_dict = {}
    for key in word_frequence:
        word_frequence_dict[key] = word_frequence[key]
    
    wordcloud.generate_from_frequencies(word_frequence_dict)
    # Generate colour values from the mask image
    # image_colors = ImageColorGenerator(color_mask)
    # Recolour the cloud with those colours
    # wordcloud.recolor(color_func=image_colors)
    # Save the image
    wordcloud.to_file('output.png')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
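
    The most frequent segments are often single characters or filler words; a minimal sketch of dropping one-character segments before taking the top 500 frequencies (insert ahead of the head(500) step):

    # Minimal sketch: keep only segments longer than one character
    words_stat = words_stat[words_stat['segment'].str.len() > 1]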