• Python爬取网上笑话并定时邮件发送--P.S.想经常收笑话的同学,可以邮件我


    我的博客

    效果如下图

    1

    2

    Author : Leon
    Email  : yangli0534@yahoo.com
    Description: 1 grab a joke from the Internet              2 email to someone on schedule

      1 # -*- coding:gb2312 -*-
      2 
      3 #!/usr/bin/python
      4 __author__ = 'Leon'
      5 
      6 '''
      7     Author : Leon
      8     Email  : yangli0534@yahoo.com
      9     Description: 1 grab a joke from the Internet 
     10                  2 email to someone on schedule 
     11                  3 参考了网友李鹏飞关于抓取网页的部分 ,感谢。侵权删
     12 '''
     13 import smtplib
     14 from email.MIMEMultipart import MIMEMultipart
     15 from email.MIMEText import MIMEText
     16 import urllib2
     17 import re
     18 import schedule
     19 import time
     20 import datetime
     21 
     22 class randomJoke:
     23 
     24     #初始化方法
     25     def __init__(self):
     26         self.url = 'http://lengxiaohua.com/random'
     27         self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
     28         #初始化headers
     29         self.headers = { 'User-Agent' : self.user_agent }
     30         #笑话内容
     31         self.content = []
     32 
     33     #获取网页源代码
     34     def getSourceCode(self):
     35         try:
     36             request = urllib2.Request(url = self.url, headers=self.headers)
     37             response = urllib2.urlopen(request)
     38             sourceCode = response.read().decode('utf-8')
     39             return sourceCode
     40         except urllib2.URLError, e:
     41             if hasattr(e,"reason"):
     42                 print u"网络错误...",e.reason
     43                 return None
     44 
     45     #获取笑话
     46     def setContent(self):
     47         sourceCode = self.getSourceCode()
     48         if not sourceCode:
     49             print('获取网页内容失败~!')
     50             quit()
     51         pattern = re.compile(' <pre.*?js="joke_summary".*?"first_char">(.*?)</span>(.*?)</pre>.*?class="user_info">.*?<a.*?>(.*?)</a>.*?(.*?)',re.S)
     52         items = re.findall(pattern,sourceCode)
     53         self.content = items
     54         #print u"已经爬取源代码...正在解析源代码..."
     55 
     56     #返回笑话
     57     def getContent(self):
     58         return self.content
     59 
     60     #打印一则笑话
     61     def printAJoke(self,number):
     62         joke = self.content[number]
     63         print u"作者:%s" %(joke[2])
     64         print u'发表于:'+ joke[3]
     65         #item[0]和item[1]组成完整的内容
     66         print joke[0]+joke[1]
     67 
     68     def getAJoke(self,number):
     69         joke = self.content[number]
     70         content = ""
     71         #content = content+ u"作者:" %(joke[2])
     72         #print u'发表于:'+ joke[3]
     73         #item[0]和item[1]组成完整的内容
     74         content =  joke[0]+joke[1]
     75         return content
     76 ## run the task on schedule
     77 def job():
     78     global myRandomJoke
     79     #global server
     80     global toaddr
     81     global fromaddr
     82     global password
     83     t = datetime.datetime.now()
     84     content = ""
     85     content = content+ u"你好,这里是随机笑话!"
     86     content = content+ "It's "
     87     content = content+ t.strftime("%A, %d. %B %Y %I:%M%p")+'
    '
     88     myRandomJoke.setContent()
     89     #myRandomJoke.printAJoke(2)
     90     content = content+myRandomJoke.getAJoke(2)
     91     print content
     92     msg = MIMEMultipart()
     93     msg['From'] = fromaddr
     94     msg['To'] = toaddr
     95     msg['Subject'] = "Leon send a joke for u on"+t.strftime("%A, %d. %B %Y %I:%M%p")
     96     
     97     try:
     98         body = "YOUR MESSAGE HERE"
     99         #msg.attach(MIMEText(content, 'plain'))
    100         #msg.attach(MIMEText(content, 'plain'))
    101         msg.attach(MIMEText(content,format,'utf-8'))
    102         text = msg.as_string()
    103         #server = smtplib.SMTP_SSL("smtp.126.com", 25)# connect to email server
    104         server = smtplib.SMTP("smtp.126.com")# connect to email server
    105         server.login(fromaddr,password)
    106         server.sendmail(fromaddr, toaddr, text)
    107         server.quit()
    108         print "send email successfully"
    109     except:
    110         print "failed!"
    111 
    112 toaddr = "somebody@yahoo.com" # email address to send
    113 
    114 fromaddr = "yourname@126.com"
    115 password = "xxxxxxxxxxx"#
    116 #server = smtplib.SMTP('smtp.yahoo.com', 587, None, 30)
    117 #server = smtplib.SMTP_SSL('smtp.googlemail.com', 465)
    118 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
    119 
    120 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
    121 #server.login(fromaddr,password)
    122 myRandomJoke = randomJoke()
    123 job()
    124 schedule.every(2).minutes.do(job)#run every 2 minutes
    125 #notQuit = True
    126 #print u"你好,这里是随机笑话!"
    127 while True:
    128     schedule.run_pending()
    129     time.sleep(10)
    130     
    131 
    132 
    133 server.quit()
    134 quit()

     遇到的问题有1.SMTP的问题,和具体的邮箱相关。

    2.汉字编码。使用utf-8编码的内容,在一些客户端中无法显示,修改为gbk编码后解决。

    经验:尽量使用开发、测试完善的package,会更稳定。比如这个定时程序,也可以用datetime获取时间然后自行判断来实现,但性能不如使用schedule模块。

    更新

    因为发现上述抓取笑话的网站更新不及时,于是更新为糗事百科上抓取文本,更新后的代码如下:

      1                      # -*- coding:utf-8 -*-
      2 
      3 import urllib
      4 import urllib2
      5 import re
      6 import thread
      7 import time
      8 import random
      9 #糗事百科爬虫类,在网友代码基础上修改
     10 class qiushibaike:
     11 
     12     #初始化方法,定义一些变量
     13     def __init__(self):
     14         #self.pageIndex = 30
     15         #user_agent 从火狐 HttpFox中headers查找到
     16         #self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0)'
     17         self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
     18         #初始化headers
     19         self.headers = { 'User-Agent' : self.user_agent }
     20         #存放段子的变量,每一个元素是每一页的段子们
     21         self.stories = []
     22         #存放程序是否继续运行的变量
     23         self.enable = False
     24         self.pageStories = []
     25         
     26     #传入某一页的索引获得页面代码
     27     def getPage(self):
     28         try:
     29             pageIndex = random.randint(2,35)
     30             url = 'https://www.qiushibaike.com/text/page/' + str(pageIndex)+'/'
     31             #构建请求的request
     32             request = urllib2.Request(url,headers = self.headers)
     33             #利用urlopen获取页面代码
     34             response = urllib2.urlopen(request)
     35             #将页面转化为UTF-8编码
     36             pageCode = response.read().decode('utf-8')
     37             return pageCode
     38 
     39         except urllib2.URLError, e:
     40             if hasattr(e,"reason"):
     41                 print u"连接糗事百科失败,错误原因",e.reason
     42                 return None
     43 
     44 
     45     #传入某一页代码,返回本页不带图片的段子列表
     46     def getPageItems(self):
     47         pageCode = self.getPage()
     48         if not pageCode:
     49             print "页面加载失败...."
     50             return None
     51         pattern = re.compile('<div class="author clearfix">.*?href.*?<img src.*?title=.*?<h2>(.*?)</h2>.*?<div class="content">(.*?)</div>.*?<i class="number">(.*?)</i>',re.S)
     52         items = re.findall(pattern,pageCode)
     53         #print len(items)
     54         #print "*****items"
     55         #用来存储每页的段子们
     56         self.pageStories = []
     57         #遍历正则表达式匹配的信息
     58         i = 0
     59         for item in items:
     60             #如果不含有图片,把它加入list中
     61             #if not haveImg:
     62                 replaceBR = re.compile('<.?span>')
     63                 #将<br/> 用 换行符
     替换
     64                 text = re.sub(replaceBR,"
    ",item[1])
     65                 #item[0]是一个段子的发布者,item[1]是内容,item[2]是发布时间,item[4]是点赞数
     66                 self.pageStories.append([item[0],text,item[2]])
     67 
     68                 #strip()作用:去掉转义字符后输出
     69         #print item[0] +"-------0t"
     70         #print item[1] +"-------0T"
     71         #print text + "******TEXT"
     72         #print item[2] +"-------0t"
     73        
     74         #return pageStories
     75 
     76     #加载并提取页面的内容,加入到列表中
     77     def loadPage(self):
     78         #如果当前未看的页数少于2页,则加载新一页
     79         if self.enable == True:
     80             if len(self.stories) < 2:
     81                 #获取新一页
     82                 #pageStories = self.getPageItems()
     83                 self.getPageItems()
     84                 #将该页的段子存放到全局list中
     85                 if self.pageStories:
     86                     self.stories.append(self.pageStories)
     87                     #获取完之后页码索引加一,表示下次读取下一页
     88                     #self.pageIndex += 1
     89 
     90                 #print len(self.stories)
     91     #调用该方法,每次敲回车打印输出一个段子
     92     def getOneStory(self):
     93         #遍历一页的段子
     94 
     95         # for story in pageStories:
     96         #     #等待用户输入
     97         #     input = raw_input()
     98         #     #每当输入回车一次,判断一下是否要加载新页面
     99         #     self.loadPage()
    100         #     #如果输入Q则程序结束
    101         #     if input == "Q":
    102         #         self.enable = False
    103         #         return
    104         #     #print "$$$"
    105         #     print len(story)
    106         #     #现在网页已没有发布时间了
    107         #     #print u"第%d页	发布人:%s	发布时间:%s	赞:%s
    %s" %(page,story[0],story[2],story[3],story[1])
    108         #     print u"第%d页	发布人:%s	赞:%s
    %s" %(page,story[0],story[2],story[1])
    109         self.loadPage()
    110         len_page = len(self.pageStories)
    111         story = self.pageStories[random.randint(0, len_page-1)]
    112         print u'回车看下一个,Q退出'
    113         input = raw_input()
    114         #     #每当输入回车一次,判断一下是否要加载新页面
    115         #     self.loadPage()
    116         #     #如果输入Q则程序结束
    117         if input == "Q":
    118             self.enable = False
    119             return
    120         #print len(story)
    121         print u'%s' %story[1]
    122     #开始方法
    123     def start(self):
    124         print u"正在读取糗事百科,按回车查看新段子,Q退出"
    125         #使变量为True,程序可以正常运行
    126         self.enable = True
    127        
    128         #先加载一页内容
    129         self.loadPage()
    130         
    131         #局部变量,控制当前读到了第几页
    132         #nowPage = 0
    133         while self.enable:
    134             if len(self.stories)>0:
    135                 #print len(self.stories)
    136                 #print "-------stories"
    137                 #从全局list中获取一页的段子
    138                 self.pageStories = self.stories[0]
    139                 #当前读到的页数加一
    140                 #nowPage += 1
    141                 #将全局list中第一个元素删除,因为已经取出
    142                 del self.stories[0]
    143                 #print "---------------------------------"
    144                 #print len(pageStories)
    145                 #print nowPage
    146                 #输出该页的段子
    147                 self.getOneStory()
    148     def getAJoke(self):
    149         self.enable = True
    150         self.loadPage()
    151         self.pageStories = self.stories[0]
    152         del self.stories[0]
    153         len_page = len(self.pageStories)
    154         story = self.pageStories[random.randint(0, len_page-1)]
    155         self.enable = True
    156         return story[1]
    157 
    158         #print u'回车看下一个,Q退出'
    159         #input = raw_input()
    160         #     #每当输入回车一次,判断一下是否要加载新页面
    161         #     self.loadPage()
    162         #     #如果输入Q则程序结束
    163         #if input == "Q":
    164         #   self.enable = False
    165         #    return
    166         #print len(story)
    167         #print u'%s' %story[1]
    168 
    169 #spider = qiushibaike()
    170 #spider.start()
    171 #print spider.getAJoke()

    使用方法如下:

     1 #-*- coding:utf-8 -*-
     2 
     3 
     4 #!/usr/bin/python
     5 __author__ = 'Leon'
     6 
     7 '''
     8     Author : Leon
     9     Email  : yangli0534@yahoo.com
    10     Description: 1 grab a joke from the Internet 
    11                  2 email to someone on schedule 
    12                  3 参考了部分网友的代码 ,感谢。侵权删
    13 '''
    14 import smtplib
    15 from email.MIMEMultipart import MIMEMultipart
    16 from email.MIMEText import MIMEText
    17 import re
    18 import schedule
    19 import time
    20 import datetime
    21 from qiushibaike import qiushibaike
    22 
    23 def job():
    24     #global myRandomJoke
    25     global myQiuBai
    26     #global server
    27     global toaddr
    28     global fromaddr
    29     global password
    30     t = datetime.datetime.now()
    31     content = ''
    32     content = content+ u'笑口常开!'
    33     content = content+ u"It's "
    34     content = content+ t.strftime("%A, %d. %B %Y %I:%M%p")+'
    '
    35     #myRandomJoke.setContent()
    36     #myRandomJoke.printAJoke(2)
    37     #content = content + myRandomJoke.getAJoke(2)
    38     content = content + myQiuBai.getAJoke()
    39     print content
    40     #content = u'''你好,这是一封测试邮件,来自yangli0534@yahooc.com'''
    41     #content = content +t.strftime("%A, %d. %B %Y %I:%M%p")
    42     msg = MIMEMultipart()
    43     msg['From'] =fromaddr
    44     msg['To'] =','.join(toaddr)
    45     msg['Cc'] = ','.join(ccaddr)
    46     msg['Bcc'] = ','.join(ccaddr)
    47     msg['Subject'] = u"Leon send a joke to you on"+t.strftime("%A, %d. %B %Y %I:%M%p")
    48     
    49     try:
    50         #body = "YOUR MESSAGE HERE"
    51         body = content
    52         #msg.attach(MIMEText(content, 'plain'))
    53         #msg.attach(MIMEText(content, 'plain'))
    54         #msg.attach(MIMEText(content,format,'utf-8'))
    55         msg.attach(MIMEText(body.encode('gbk')))
    56         text = msg.as_string()
    57         #server = smtplib.SMTP_SSL("smtp.126.com", 25)# connect to email server
    58         server = smtplib.SMTP("smtp.139.com",25)# connect to email server
    59         server.login(fromaddr,password)
    60         #server.sendmail(fromaddr, toaddr, text)
    61         server.sendmail(fromaddr, toaddr + ccaddr, text)
    62         #server.sendmail(fromaddr, fromaddr,text)
    63         server.quit()
    64         print "send email successfully"
    65     except:
    66         print "failed!"
    67 
    68 toaddr = ['1184802734@qq.com','18811007706@139.com'] # email address to send
    69 ccaddr = ['502327976@qq.com'] # carbon copy
    70 bccaddr = ['15210579762@139.com']#blind carbon copy
    71 #toaddr2 = '502327976@qq.com'
    72 fromaddr = 'china__mobile@139.com'#send address
    73 password = "xxxxxxx"#password
    74 #server = smtplib.SMTP('smtp.yahoo.com', 587, None, 30)
    75 #server = smtplib.SMTP_SSL('smtp.googlemail.com', 465)
    76 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
    77 
    78 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
    79 #server.login(fromaddr,password)
    80 #myRandomJoke = randomJoke()
    81 myQiuBai = qiushibaike() # 
    82 job()
    83 schedule.every(2).minutes.do(job)# send a email every 2 minutes
    84 #notQuit = True
    85 #print u"你好,这里是随机笑话!"
    86 while True:
    87     schedule.run_pending()# 
    88     time.sleep(10)
    89     
    90 
    91 
    92 server.quit()
    93 quit()
  • 相关阅读:
    poj1273
    JavaSE入门学习23:Java面向对象之构造方法
    NOI 2015 滞后赛解题报告
    LuaInterface简单介绍
    解决在onCreate()过程中获取View的width和Height为0的4种方法
    函数指针和指针函数
    Quartz-中断正在执行的任务
    servlet3.0获取参数与文件上传代码示例
    Servlet学习:(三)Servlet3.0 上传文件
    layui 批量上传文件 + 后台 用servlet3.0接收【我】
  • 原文地址:https://www.cnblogs.com/hiramlee0534/p/7191179.html
Copyright © 2020-2023  润新知