import re
from urllib import request
# Harvest e-mail addresses from web pages.
def getEmailsByLine(url):
    """Print the e-mail addresses found on each line of the page at *url*.

    The response is consumed line by line, so the whole document is never
    held in memory at once. For every line that contains at least one
    address, the list of addresses on that line is printed.

    Args:
        url: Location of the resource to fetch; anything accepted by
            ``urllib.request.urlopen``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
    """
    # Each domain label must be introduced by a literal "." (escaped, so it
    # no longer matches arbitrary characters). Requiring complete ".label"
    # groups also keeps trailing sentence punctuation out of the match.
    # No IGNORECASE flag is needed: the classes already list both cases.
    email_pattern = re.compile(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
    )
    # The context manager closes the connection even if processing fails.
    with request.urlopen(url) as response:
        for raw_line in response:
            # Addresses are plain ASCII, so undecodable bytes are replaced
            # instead of aborting the whole scan.
            text = raw_line.decode("utf-8", errors="replace")
            found = email_pattern.findall(text)
            if found:
                print(found)
def getEmailsByAll(url):
    """Print every e-mail address found in the page at *url* as one list.

    The whole response body is read and decoded in one go, then scanned
    once. Nothing is printed when no address is found.

    Args:
        url: Location of the resource to fetch; anything accepted by
            ``urllib.request.urlopen``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
    """
    # Each domain label must be introduced by a literal "." (escaped, so it
    # no longer matches arbitrary characters). Requiring complete ".label"
    # groups also keeps trailing sentence punctuation out of the match.
    # No IGNORECASE flag is needed: the classes already list both cases.
    email_pattern = re.compile(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
    )
    # The context manager closes the connection as soon as the body is read.
    with request.urlopen(url) as response:
        body = response.read()
    # Addresses are plain ASCII, so undecodable bytes are replaced instead
    # of aborting the scan.
    found = email_pattern.findall(body.decode("utf-8", errors="replace"))
    if found:
        print(found)
if __name__ == "__main__":
    # Demo run: scan a sample forum thread and print all addresses at once.
    # getEmailsByLine(page_url) is the streaming, line-by-line alternative.
    page_url = "http://bbs.tianya.cn/post-140-393973-1.shtml"
    getEmailsByAll(page_url)