昨晚心血来潮,尝试用python写了一个ftp文件传输服务,可以接收指令,从远程ftp服务器同步指定目录数据,最后没用上,开源出来。
https://github.com/jadepeng/ftp_transfer_service.git
运行原理
- 'task_server' 是一个web服务器,可以接收传入任务,接收到任务后,将task写入mysql
- 启动任务后,'task_server'会扫描ftp文件列表,写入redis队列
- 'transfer_client' 是传输执行程序,可以多点部署,该程序会读取redis队列,进行文件下载
使用
配置
修改 .env 文件,配置mysql和redis地址
REDIS_SERVER=""
REDIS_PORT=6380
REDIS_PASSWORD=""
MYSQL_HOST=""
MYSQL_PORT=3306
MYSQL_PASSWORD=""
MYSQL_USER=""
MYSQL_DB=""
启动服务
server 端
python3 task_server.py
传输端,可以部署多个
python3 transfer_client.py
接收任务
POST /task/
{
"taskId": "9",
"serverPath": "/weblog",
"storagePath": "/data",
"host": "ftpServer",
"port": 21,
"user": "user",
"password": "password"
}
启动传输
GET /task/{taskId}/start
查看进度
GET /task/{taskId}/progress
实现简介
第一次用fastapi来写web服务,这里记录下有意思的地方。
配置
可以通过配置类实现app的配置参数,pydantic还可以加载env文件更新配置
setting.py
from pydantic import BaseSettings
class APISettings(BaseSettings):
    """Connection settings for the MySQL and Redis backends.

    Values are declared as typed fields; the nested ``Config`` points
    pydantic at a ``.env`` file from which they can be overridden.
    """

    # --- MySQL ---
    mysql_host: str = "127.0.0.1"
    mysql_port: int = 3306
    mysql_password: str
    mysql_user: str
    mysql_db: str

    # --- Redis ---
    redis_server: str = "127.0.0.1"
    redis_port: int = 6380
    redis_password: str

    max_wait_time_count: int = 10

    class Config:
        # Location and encoding of the env file used to override the fields.
        env_file = ".env"
        env_file_encoding = "utf-8"
redis 队列
通过list实现队列,rpush,blpop
import redis
class RedisQueue(object):
    """FIFO queue kept in a Redis list under the key ``<namespace>:<name>``."""

    def __init__(self, name, namespace='queue', **redis_kwargs):
        """Create the Redis client; ``redis_kwargs`` go straight to ``redis.Redis``."""
        self.__db = redis.Redis(**redis_kwargs)
        self.key = f"{namespace!s}:{name!s}"

    def qsize(self):
        """Return the number of elements currently in the list."""
        return self.__db.llen(self.key)

    def put(self, item):
        """Append ``item`` at the right end of the list."""
        self.__db.rpush(self.key, item)

    def get_wait(self, timeout=None):
        """Blocking pop from the left end; returns the raw ``blpop`` result."""
        return self.__db.blpop(self.key, timeout=timeout)

    def get_nowait(self):
        """Non-blocking pop from the left end; returns the raw ``lpop`` result."""
        return self.__db.lpop(self.key)
redis BloomFilter
BloomFilter 可以用来去重
import mmh3
import redis
class BloomFilter(object):
    """Bloom filter stored in a Redis bitmap, hashed with ``mmh3``.

    Each item is hashed ``hash_count`` times with consecutive seeds starting
    at ``start_seed``; every hash, taken modulo ``bit_size``, selects one bit
    of the Redis string stored at ``bf_key``.
    """

    # Container types that madd()/mexists() treat as a batch of items.
    _BATCH_TYPES = (list, tuple, set, frozenset)

    def __init__(self, bf_key, bit_size=2000000, hash_count=4, start_seed=41, **redis_kwargs):
        """Store the filter parameters and create the Redis client.

        ``redis_kwargs`` are passed unchanged to ``redis.Redis``.
        """
        self.bit_size = bit_size
        self.hash_count = hash_count
        self.start_seed = start_seed
        self.client = redis.Redis(**redis_kwargs)
        self.bf_key = bf_key

    def add(self, data):
        """Set every bit that ``data`` hashes to."""
        for index in self._get_hash_points(data):
            self.client.setbit(self.bf_key, index, 1)

    def madd(self, m_data):
        """Add a batch (list/tuple/set) of items, or a single item."""
        if isinstance(m_data, self._BATCH_TYPES):
            for data in m_data:
                self.add(data)
        else:
            self.add(m_data)

    def exists(self, data):
        """Return True if every bit for ``data`` is set (item possibly present)."""
        # A generator lets all() stop at the first unset bit instead of
        # querying Redis for the remaining ones.
        return all(
            self.client.getbit(self.bf_key, index)
            for index in self._get_hash_points(data)
        )

    def mexists(self, m_data):
        """Return ``{item: exists(item)}`` for a batch or for a single item."""
        if isinstance(m_data, self._BATCH_TYPES):
            return {data: self.exists(data) for data in m_data}
        # Single item: call exists(); subscripting the method would raise TypeError.
        return {m_data: self.exists(m_data)}

    def _get_hash_points(self, data):
        """Return the ``hash_count`` bit offsets that ``data`` maps to."""
        return [
            mmh3.hash(data, seed) % self.bit_size
            for seed in range(self.start_seed, self.start_seed + self.hash_count)
        ]
python的orm框架sqlalchemy
sqlalchemy 需要先定义ORM类
class TransferTask(Base):
    """ORM mapping for the ``transfer_task`` table (one row per transfer task)."""

    __tablename__ = "transfer_task"

    # Task identifier; primary key.
    taskId = Column(String(255), primary_key=True, index=True)

    # Source directory and destination directory of the transfer.
    serverPath = Column(String(255), nullable=False)
    storagePath = Column(String(255), nullable=False)

    # FTP endpoint and credentials.
    host = Column(String(255), nullable=False)
    port = Column(Integer, nullable=False)
    user = Column(String(255), nullable=False)
    password = Column(String(255), nullable=False)

    # Defaults to the insertion time (datetime.now is called per row).
    time = Column(DateTime, nullable=False, default=datetime.now)
class TransferFailedFile(Base):
    """ORM mapping for the ``transfer_failed_file`` table (one row per failed file)."""

    __tablename__ = "transfer_failed_file"

    # Surrogate auto-increment key.
    id = Column(Integer, primary_key=True, index=True, autoincrement=True)

    # Task the file belongs to, and the path that failed.
    taskId = Column(String(255), index=True)
    filePath = Column(String(1024), nullable=False)

    # Defaults to the insertion time (datetime.now is called per row).
    time = Column(DateTime, nullable=False, default=datetime.now)
class TransferProgress(Base):
    """ORM mapping for the ``transfer_task_progress`` table (one row per task)."""

    __tablename__ = "transfer_task_progress"

    # Task identifier; primary key.
    taskId = Column(String(255), primary_key=True, index=True)

    # Counters and status code for the task.
    total = Column(Integer, nullable=False)
    status = Column(Integer, nullable=False)
    finished = Column(Integer, nullable=False)
    failed = Column(Integer, nullable=False)

    # Defaults to the insertion time (datetime.now is called per row).
    time = Column(DateTime, nullable=False, default=datetime.now)
if __name__ == "__main__":
    # Run as a script: create every table declared on Base in the
    # MySQL database described by the settings.
    settings = APISettings()
    db = Database(
        settings.mysql_host,
        settings.mysql_port,
        settings.mysql_user,
        settings.mysql_password,
        settings.mysql_db,
    )
    Base.metadata.create_all(db.engine)
使用了sqlalchemy CRUD就比较方便了, 可以通过query,filter来查询和过滤
def get_or_create_progress(self, task: TransferTask):
    """Return the progress row for ``task``, inserting a zeroed one if none exists.

    A newly created row starts in the SCANNING status with all counters at 0
    and is committed before being returned.
    """
    session = self.database.get_session()
    progress = (
        session.query(TransferProgress)
        .filter(TransferProgress.taskId == task.taskId)
        .first()
    )
    if progress:
        return progress

    # No row yet for this task: build and persist the initial state.
    progress = TransferProgress()
    progress.taskId = task.taskId
    progress.total = 0
    progress.status = TaskStatus.SCANNING.value
    progress.finished = 0
    progress.failed = 0
    session.add(progress)
    session.commit()
    return progress
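这里需要注意的是,session用完后需要close,不然session过多会报错。可以把session的创建和释放封装进一个带yield的get_db(在try/finally里close);但要注意,只有把这个生成器完整消费(例如作为FastAPI的依赖,或包装成上下文管理器)时,finally才会在使用结束之后执行,通过get_session直接拿到的session仍需要调用方用完后自行close。
见database.py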
def get_db(self):
    """Generator that yields one new session and closes it when finished.

    The ``finally`` clause runs when the generator is resumed past the
    ``yield``, explicitly closed, or finalized.
    """
    session = self.SessionLocal()
    try:
        yield session
    finally:
        session.close()
def get_session(self):
    """Return a new session; the caller is responsible for closing it.

    The session is created directly instead of via ``next(self.get_db())``:
    abandoning that generator after a single ``next()`` lets its ``finally``
    clause close the session whenever the generator is finalized (right away
    on CPython), not when the caller has finished using it.
    """
    return self.SessionLocal()
python ftp操作
python有个ftplib,可以用来操作ftp,这里简单封装一个client类, 实现listfiles和下载文件
import ftplib
import os
from datetime import datetime
import ntpath
class FtpClient:
    """Small wrapper around ``ftplib.FTP``: recursive listing and binary download."""

    def __init__(self, host: str, port: int, user: str, password: str):
        """Remember the connection parameters and connect immediately."""
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.connect()

    def connect(self):
        """Open the control connection and log in with the stored credentials."""
        self.ftp = ftplib.FTP()
        # Set the encoding before connecting so that the login exchange and
        # the server replies use it too, not only the commands sent later.
        self.ftp.encoding = "utf-8"
        self.ftp.connect(host=self.host, port=self.port)
        self.ftp.login(self.user, self.password)

    def list_files(self, dir):
        """Recursively yield ``(path, size)`` for every file below ``dir``.

        Entries are listed with MLSD. Only ``type=dir`` entries are descended
        into; the ``cdir``/``pdir`` entries a server reports for "." and ".."
        are skipped, since following them would recurse forever. Paths are
        always built from ``dir`` so the result does not depend on the
        connection's current working directory. A file whose size cannot be
        queried is reported on stdout and skipped.
        """
        for file_name, meta in self.ftp.mlsd(dir):
            file_type = (meta.get("type") or "").lower()
            path = f"{dir}/{file_name}"
            if file_type == "file":
                try:
                    # Listings switch the connection to ASCII mode; SIZE is
                    # queried in binary mode.
                    self.ftp.voidcmd("TYPE I")
                    file_size = self.ftp.size(path)
                except (*ftplib.all_errors, ValueError) as e:
                    print(e)
                    continue
                yield path, file_size
            elif file_type == "dir":
                yield from self.list_files(path)

    def download_file(self, file_name: str, local_file_name: str):
        """Download ``file_name`` into ``local_file_name`` in binary mode.

        Returns True on success. If the server refuses the file
        (``ftplib.error_perm``) the error is printed, the partial local file
        is removed and False is returned. Other errors propagate.
        """
        try:
            # The context manager closes the local file before any cleanup runs.
            with open(local_file_name, "wb") as local_file:
                self.ftp.retrbinary('RETR %s' % file_name, local_file.write)
        except ftplib.error_perm:
            print('ERROR: cannot read file "%s"' % file_name)
            os.unlink(local_file_name)
            return False
        return True
下载程序
作为redis mq的消费者,要考虑的是下载失败了如何处理,异常退出如何处理?进度如何更新?
针对异常退出,这里用一个简单的方案,获取mq消息后,先将item写入到本地文件,这样如果client程序异常退出,下次进来还能继续
针对下载失败,这里失败后先重新放入队列,retryCount+1,如果超过最大重试次数,则写到错误记录。
进度更新,则依靠update+1执行。
def transfer_task_item(ftp, local_path, queque, task, task_item, max_retry_count=3):
    """Download one queued file and record the outcome.

    Args:
        ftp: client exposing ``download_file(remote_name, local_name)``.
        local_path: local prefix prepended to ``task_item['fileName']``.
        queque: queue exposing ``put``; failed items are re-queued on it.
        task: task object; only ``task.taskId`` is read.
        task_item: dict with ``fileName``, ``fileSize`` and, optionally,
            ``retryCount`` (treated as 0 when absent).
        max_retry_count: number of re-queues allowed before the file is
            recorded as failed.

    A file already present locally with the expected size is counted as
    finished without downloading. Any exception re-queues the item with
    ``retryCount`` incremented, or records a failed file once the retry
    limit is reached. ``remove_lock()`` is always called at the end.
    """
    try:
        # Plain concatenation on purpose: os.path.join would discard
        # local_path whenever fileName is an absolute path.
        local_file_name = local_path + task_item['fileName']
        print("transfer %s to %s" % (task_item['fileName'], local_file_name))
        # Already downloaded with the expected size: only update progress.
        if (os.path.exists(local_file_name)
                and os.path.getsize(local_file_name) == task_item["fileSize"]):
            db_service.update_finished(task.taskId, 1)
            return
        target_dir = os.path.abspath(os.path.dirname(local_file_name))
        os.makedirs(target_dir, exist_ok=True)
        ftp.download_file(task_item['fileName'], local_file_name)
        # A download that left no local file did not succeed, even if the
        # client did not raise; send it through the retry path instead of
        # counting it as finished.
        if not os.path.exists(local_file_name):
            raise FileNotFoundError(local_file_name)
        # Update progress.
        db_service.update_finished(task.taskId, 1)
    except Exception as e:
        print(e)
        # A missing retryCount must not raise inside the handler, which
        # would drop the item without re-queueing or recording it.
        retry_count = task_item.get('retryCount', 0)
        if retry_count < max_retry_count:
            task_item['retryCount'] = retry_count + 1
            queque.put(json.dumps(task_item))
        else:
            print(task_item['fileName'] + " transfer failed with max_retry_count")
            db_service.add_failed_file(task.taskId, task_item['fileName'])
    finally:
        remove_lock()