• 爬虫小案例:豆瓣Top250电影


    获取豆瓣Top250电影列表,然后给自己发邮件

    直接上代码:

    import requests,os,csv,time,smtplib
    from email.mime.text import MIMEText
    from email.utils import formataddr
    from email.header import Header
    from email.header import  make_header
    from email.mime.multipart import MIMEMultipart
    
    from bs4 import BeautifulSoup
    
    # Scrape the Douban Top 250 list (10 pages x 25 films) and persist it.
    #
    # Every film is appended as one row to a CSV file in the current working
    # directory, and a formatted one-line summary is collected in ``filmlist``
    # so that the e-mail section further down can use it as the message body.
    file_path = os.path.join(os.getcwd(), "豆瓣Top250电影.csv")
    if not os.path.isfile(file_path):
        # Create the file with its header row only once; later rows are appended.
        # The utf-8-sig encoding argument is supported on Python 3, not Python 2.
        with open(file_path, 'w', newline='', encoding='utf-8-sig') as f:
            csv.writer(f).writerow(['排名', '电影名称', '上映年份', '地区', '类型', '评分', '推荐语', '链接'])

    # One formatted summary line per film.
    filmlist = []

    # Browser-like request header to get past the anti-crawler check.
    # Built once because it does not change between pages.
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 OPR/65.0.3467.78 (Edition Baidu)'}

    for x in range(10):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(x * 25)

        # A timeout keeps a stalled connection from hanging the script, and a
        # network error on one page must not abort the remaining pages.
        try:
            res = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as err:
            print('请求失败!:{0}'.format(err))
            continue
        if res.status_code != 200:
            print('请求失败!')
            continue

        print('正获取第{}页电影数据...'.format(x+1))
        soup = BeautifulSoup(res.text, 'html.parser')
        ol = soup.find('ol', class_='grid_view')
        if ol is None:
            # A 200 response without the film list (unexpected markup):
            # treat it as a failed page instead of crashing on None.
            print('请求失败!')
            continue

        page_rows = []
        for li in ol.find_all('li'):
            # Rank
            num = li.find('div', class_='pic').find('em').text

            info = li.find('div', class_='info')
            link_tag = info.find('div', class_='hd').find('a')

            # Title: concatenation of all <span> fragments inside the link
            title = ''.join(span.text for span in link_tag.find_all('span'))

            # Detail page link
            link = link_tag['href']

            # Rating
            rating_num = info.find('span', class_='rating_num').text

            # Tagline: some films have none, so the <span class="inq"> is absent
            inq_tag = info.find('span', class_='inq')
            inq = inq_tag.text if inq_tag is not None else ''

            # Release year / region / genre, separated by "/" in the third
            # child node of the first <p> of the "bd" block.
            detail = info.find('div', class_='bd').find('p').contents[2]
            parts = [part.strip() for part in detail.split('/')]
            if len(parts) >= 3:
                # Region and genre are the last two fields; anything before
                # them is year information (a film may list several years).
                year = ' / '.join(parts[:-2])
                region, genre = parts[-2], parts[-1]
            else:
                # Fewer fields than expected: pad with blanks instead of
                # raising IndexError.
                year, region, genre = (parts + ['', '', ''])[:3]

            page_rows.append([num, title, year, region, genre, rating_num, inq, link])
            filmlist.append("{0}.{1}:{2}/{3}/{4},评分:{5},推荐语:{6},链接:{7}".format(num, title, year, region, genre, rating_num, inq, link))

        # Append the whole page in one go instead of reopening the file per row.
        with open(file_path, 'a', newline='', encoding='utf-8-sig') as f:
            csv.writer(f).writerows(page_rows)

        # Be polite to the server between pages.
        time.sleep(0.75)
    
    # Send the scraped list by e-mail: the summaries collected in ``filmlist``
    # form the plain-text body and the CSV file is attached.
    my_sender = 'xxx'  # sender mailbox account
    my_pass = 'xxx'  # sender mailbox password
    my_receiver = 'lsjljq@163.com'  # recipient mailbox account


    def _log(message):
        """Print *message* prefixed with the current local timestamp."""
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + ":" + message)


    try:
        _log("准备发送邮件")

        # Multipart container so the message can carry both body and attachment.
        msg = MIMEMultipart()
        msg['From'] = formataddr(["ljq", my_sender])  # (display name, sender address)
        msg['To'] = formataddr(["lsjljq", my_receiver])  # (display name, recipient address)

        # Subject
        subject = '豆瓣Top250电影'
        msg['Subject'] = Header(subject, 'utf-8')

        # Body: one film per line.
        contenttext = "\n".join(filmlist)
        msg.attach(MIMEText(contenttext, 'plain', 'utf-8'))

        # Attachment: the CSV file written by the scraping step. The context
        # manager makes sure the file handle is closed again.
        with open(file_path, 'rb') as attachment_file:
            att1 = MIMEText(attachment_file.read(), 'base64', 'utf-8')
        # A non-ASCII file name has to be header-encoded first.
        encoded_name = make_header([('豆瓣Top250电影', 'UTF-8')]).encode()
        # MIMEText already set a Content-Type header; replace it rather than
        # assigning, which would append a second, conflicting header.
        att1.replace_header("Content-Type", 'application/octet-stream;name="{0}"'.format(encoded_name))
        # The filename given here is the name shown in the received mail.
        att1["Content-Disposition"] = 'attachment; filename="{0}.csv"'.format(encoded_name)
        msg.attach(att1)

        _log("正连接邮件服务器...")
        # SMTP over SSL on port 465. The context manager closes the
        # connection even when login or sending fails.
        with smtplib.SMTP_SSL("smtp.exmail.qq.com", 465) as server:
            _log("登录中...")
            server.login(my_sender, my_pass)  # sender account, sender password
            _log("正在发送邮件...")
            server.sendmail(my_sender, [my_receiver], msg.as_string())  # sender, recipients, raw message
        print('邮件发送成功!')
    except Exception as err:
        # Top-level boundary of the script: report the failure instead of a traceback.
        print('邮件发送失败!:{0}'.format(err))
  • 相关阅读:
    监听属性改变defineProperty和文档碎片createDocumentFragment
    this指向bind、call、apply
    css mask文字渐变+clip-path裁剪路径+border-image图片边框
    浅谈 Hybrid App
    activiti与flowable的区别(转)
    JAVA:定时器的三种方法(详细注解)
    Activiti5
    别再写满屏的try-catch了,真丑,全局异常处理不会吗?(转)
    共享锁、排他锁、互斥锁、悲观锁、乐观锁、行锁、表锁、页面锁、不可重复读、丢失修改、读脏数据...(转)
    什么是跨域?跨域解决方法(转)
  • 原文地址:https://www.cnblogs.com/KeenLeung/p/12157923.html
Copyright © 2020-2023  润新知