问题描述
在微软云环境中,如何使用 Python SDK 连接存储账号(Storage Account)并计算其中 Blob 的大小?Azure 提供了一个专用工具 Azure Storage Explorer,可以统计出 Blob 的大小:
但是它只能对 Blob Container 逐个进行统计,如果 Container 数量巨大,这将是一项繁琐的工作。而作为开发者,应该让代码来帮助完成。下文使用最快上手的 Python 代码来计算 Blob 的容量大小。
完整代码
import datetime
import logging
import os
import threading
import uuid

from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__

# Blob sizes are reported in bytes; results are stored in MiB.
BYTES_PER_MB = 1024 * 1024


def calculateBlob(connect_string, count):
    """Add up the blob sizes of every container in one storage account.

    The function reads the module-level ``now_date`` (a ``%Y%m%d`` string)
    and updates the module-level dictionaries ``blobSize_Total`` and
    ``blobSize_Daily``; the ``__main__`` section below creates all three
    before calling it. Both dictionaries are keyed by container name and
    hold sizes in MiB.

    Args:
        connect_string: Connection string of the storage account.
        count: Identifier of the caller, only used as a prefix of the
            message reported when the connection string is rejected.

    Returns:
        None. Results are written to the module-level dictionaries.
    """
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connect_string)
    except Exception as e:
        messages = str(count) + "Connect_String Error, Messages:" + str(e.args)
        print(messages)
        # Logged as an error: the default root logger level is WARNING, so
        # an INFO record would be silently dropped.
        logging.error(messages)
        return None

    for container in blob_service_client.list_containers():
        container_name = container.name
        print(container_name)
        blobSize_Total.setdefault(container_name, 0)
        blobSize_Daily.setdefault(container_name, 0)

        container_client = blob_service_client.get_container_client(container_name)
        total_size_container = 0
        daily_size_container = 0
        for blob in container_client.list_blobs():
            total_size_container += blob.size
            # now_date is built from the local clock, so the creation time is
            # converted to local time before the calendar days are compared.
            # NOTE(review): the SDK is expected to return timezone-aware UTC
            # datetimes here - confirm; a naive value is treated as local time.
            blob_create_date = blob.creation_time.astimezone().strftime("%Y%m%d")
            if blob_create_date == now_date:
                # Blob was created today: count it in the daily figure too.
                daily_size_container += blob.size

        blobSize_Total[container_name] += total_size_container / BYTES_PER_MB
        blobSize_Daily[container_name] += daily_size_container / BYTES_PER_MB

    return None


if __name__ == '__main__':
    # Connection string of the storage account to inspect.
    Connection_String_List = "DefaultEndpointsProtocol=https;AccountName=<storagename>;AccountKey=<key>;EndpointSuffix=core.chinacloudapi.cn"
    start = datetime.datetime.now()
    print(start)

    # Module-level state shared with calculateBlob.
    blobSize_Daily = {}
    blobSize_Total = {}
    now_date = datetime.datetime.now().strftime("%Y%m%d")

    print("开始计算")
    calculateBlob(Connection_String_List, 1)
    print("计算完成")

    print("统计当前新增大小")
    print(blobSize_Daily)
    print("统计Blob总大小")
    print(blobSize_Total)

    end = datetime.datetime.now()
    print(end)
如运行时提示没有 Azure blob 模块,可以使用 pip install azure-storage-blob 安装。以上代码运行结果如下:
如果有多个 Storage Account,可以考虑使用多线程的方式来运行:在代码中增加一个 myThread 类和一个 many_thread 函数,把 Connection_String_List 改为包含多个连接字符串的列表(list),然后在 __main__ 中把 calculateBlob(Connection_String_List, 1) 替换为 many_thread(Connection_String_List) 即可。
class myThread(threading.Thread):
    """Worker thread that runs calculateBlob for one connection string."""

    def __init__(self, threadID, name, connection_string):
        """Store the thread id, thread name and connection string.

        Args:
            threadID: Numeric id, passed to calculateBlob as its ``count``.
            name: Thread name, printed when the thread starts and exits.
            connection_string: Storage account connection string to process.
        """
        super().__init__(name=name)
        self.threadID = threadID
        self.connection_string = connection_string

    def run(self):
        """Process the stored connection string, printing start/exit markers."""
        print("开始线程:" + self.name)
        calculateBlob(self.connection_string, self.threadID)
        print("退出线程:" + self.name)


def many_thread(Connection_String_List):
    """Run calculateBlob for every connection string, one thread each.

    Args:
        Connection_String_List: Iterable of connection strings. A single
            ``str`` is accepted as well and is treated as one connection
            string; iterating it directly would start one thread per
            character.

    Returns:
        None, after every started thread has finished.
    """
    if isinstance(Connection_String_List, str):
        Connection_String_List = [Connection_String_List]

    # Create one thread per connection string.
    threads = [
        myThread(index, "Thread-" + str(index), connection_string)
        for index, connection_string in enumerate(Connection_String_List)
    ]
    # Start all threads, then wait for all of them to finish.
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
遇见问题
在多线程执行时,可能会遇见问题:("Connection broken: ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)", ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))。出现此问题大都是由于客户端使用了已经断开的连接所导致的。所以一定要仔细调试多线程的关闭代码,检查是否把还需要运行的线程提前关闭了,从而导致了以上的错误消息。
附录一:多线程计算Blob的完整代码
import datetime
import logging
import os
import threading
import uuid

from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__

# Blob sizes are reported in bytes; results are stored in MiB.
BYTES_PER_MB = 1024 * 1024

# Guards blobSize_Total / blobSize_Daily: several worker threads update them,
# and containers with the same name in different accounts share one key.
_RESULT_LOCK = threading.Lock()


def calculateBlob(connect_string, count):
    """Add up the blob sizes of every container in one storage account.

    The function reads the module-level ``now_date`` (a ``%Y%m%d`` string)
    and updates the module-level dictionaries ``blobSize_Total`` and
    ``blobSize_Daily``; the ``__main__`` section below creates all three
    before any thread is started. Both dictionaries are keyed by container
    name and hold sizes in MiB.

    Args:
        connect_string: Connection string of the storage account.
        count: Identifier of the caller (the thread id), only used as a
            prefix of the message reported when the connection string is
            rejected.

    Returns:
        None. Results are written to the module-level dictionaries.
    """
    try:
        blob_service_client = BlobServiceClient.from_connection_string(connect_string)
    except Exception as e:
        messages = str(count) + "Connect_String Error, Messages:" + str(e.args)
        print(messages)
        # Logged as an error: the default root logger level is WARNING, so
        # an INFO record would be silently dropped.
        logging.error(messages)
        return None

    for container in blob_service_client.list_containers():
        container_name = container.name
        print(container_name)
        with _RESULT_LOCK:
            blobSize_Total.setdefault(container_name, 0)
            blobSize_Daily.setdefault(container_name, 0)

        container_client = blob_service_client.get_container_client(container_name)
        total_size_container = 0
        daily_size_container = 0
        for blob in container_client.list_blobs():
            total_size_container += blob.size
            # now_date is built from the local clock, so the creation time is
            # converted to local time before the calendar days are compared.
            # NOTE(review): the SDK is expected to return timezone-aware UTC
            # datetimes here - confirm; a naive value is treated as local time.
            blob_create_date = blob.creation_time.astimezone().strftime("%Y%m%d")
            if blob_create_date == now_date:
                # Blob was created today: count it in the daily figure too.
                daily_size_container += blob.size

        # "+=" on a dict entry is a read-modify-write, so it is done under
        # the lock to avoid losing updates from other threads.
        with _RESULT_LOCK:
            blobSize_Total[container_name] += total_size_container / BYTES_PER_MB
            blobSize_Daily[container_name] += daily_size_container / BYTES_PER_MB

    return None


class myThread(threading.Thread):
    """Worker thread that runs calculateBlob for one connection string."""

    def __init__(self, threadID, name, connection_string):
        """Store the thread id, thread name and connection string.

        Args:
            threadID: Numeric id, passed to calculateBlob as its ``count``.
            name: Thread name, printed when the thread starts and exits.
            connection_string: Storage account connection string to process.
        """
        super().__init__(name=name)
        self.threadID = threadID
        self.connection_string = connection_string

    def run(self):
        """Process the stored connection string, printing start/exit markers."""
        print("开始线程:" + self.name)
        calculateBlob(self.connection_string, self.threadID)
        print("退出线程:" + self.name)


def many_thread(Connection_String_List):
    """Run calculateBlob for every connection string, one thread each.

    Args:
        Connection_String_List: Iterable of connection strings. A single
            ``str`` is accepted as well and is treated as one connection
            string; iterating it directly would start one thread per
            character.

    Returns:
        None, after every started thread has finished.
    """
    if isinstance(Connection_String_List, str):
        Connection_String_List = [Connection_String_List]

    # Create one thread per connection string.
    threads = [
        myThread(index, "Thread-" + str(index), connection_string)
        for index, connection_string in enumerate(Connection_String_List)
    ]
    # Start all threads, then wait for all of them to finish.
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()


if __name__ == '__main__':
    # One connection string per storage account.
    Connection_String_List = [
        'DefaultEndpointsProtocol=https;AccountName=<your storage account 1>;AccountKey=<Key 1>;EndpointSuffix=core.chinacloudapi.cn',
        'DefaultEndpointsProtocol=https;AccountName=<your storage account 2>;AccountKey=<Key 2>;EndpointSuffix=core.chinacloudapi.cn',
    ]
    start = datetime.datetime.now()
    print(start)

    # Module-level state shared with calculateBlob.
    blobSize_Daily = {}
    blobSize_Total = {}
    now_date = datetime.datetime.now().strftime("%Y%m%d")

    many_thread(Connection_String_List)
    print("Main Thread End")

    print(blobSize_Daily)
    print(blobSize_Total)

    end = datetime.datetime.now()
    print(end)
运行效果:
参考资料
快速入门:使用 Python v12 SDK 管理 blob :https://docs.azure.cn/zh-cn/storage/blobs/storage-quickstart-blobs-python
Python 列表(List) : https://www.runoob.com/python/python-lists.html
BlobServiceClient Class : https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python