Background:
A large volume of Falcon monitoring data is pushed into Kinesis, and the data in Kinesis is backed up to S3 in real time as JSON files (a temporary backup). To cut costs, reduce S3 storage usage, and support later data analysis, we plan to convert the JSON files on S3 into Parquet files.
Services used: Glue (script job) + Lambda (triggers the Glue script on a schedule) + CloudWatch Events (scheduler)
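For reference, a minimal sketch of the scheduled Lambda handler that the CloudWatch Events rule could invoke to start the Glue job. The job name json_to_parquet_alertlog is a placeholder, not the actual job name from this setup.

import boto3

glue = boto3.client('glue')

def lambda_handler(event, context):
    # CloudWatch Events fires this handler on a schedule; it simply starts the Glue job.
    # 'json_to_parquet_alertlog' is a placeholder job name.
    response = glue.start_job_run(JobName='json_to_parquet_alertlog')
    return response['JobRunId']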
Glue script:
import sys
from awsglue.transforms import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from pyspark.sql.types import TimestampType, DateType
from awsglue.job import Job
import boto3
import datetime
import time
import logging

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Note: despite the "hour" in the name, this value is used as a number of days below.
start_hour_before = 1
end_hour_before = 1
source_bucket_name = "backuptemp"
target_bucket_name = "kinesis"
target_prefix_name = "parquet_alertlog"
delimiter = "/"
default_region = "us-west-2"
crawler_name = "clean_alertlog"

client = boto3.client('s3')

def delete_object(bucket_name, key_name):
    # Delete the original JSON object after it has been converted to Parquet.
    try:
        response = client.delete_object(Bucket=bucket_name, Key=key_name)
    except Exception as e:
        print(str(e))
        # email_alert("error when delete_object %s/%s" % (bucket_name, key_name))

def aggragate_files(date, key_name):
    # Read one JSON object from S3 and append it to the dt-partitioned Parquet dataset.
    logger.info("start aggragate %s, time is %s." % (key_name, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
    if key_name == "":
        return
    try:
        dataframe = spark.read.json("s3://%s/%s" % (source_bucket_name, key_name))
        print("dataframe.....................", dataframe)
        dataframe.write.parquet("s3://%s/%s/dt=%s" % (target_bucket_name, target_prefix_name, date), mode="append")
        logger.info("finish aggragate %s, time is %s." % (key_name, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
    except Exception as e:
        # email_alert("error when aggragate %s/%s: %s." % (key_name, date, str(e)))
        print(str(e))
    else:
        # Only delete the source JSON file when the Parquet write succeeded.
        delete_object(source_bucket_name, key_name)

def main():
    s3 = boto3.resource('s3')
    process_slot = datetime.datetime.now() - datetime.timedelta(days=start_hour_before)
    bucket = s3.Bucket(source_bucket_name)
    dt = process_slot.strftime("%Y-%m-%d")
    for obj in bucket.objects.all():
        aggragate_files(dt, obj.key)

main()

#### commit job
job.commit()
Notes:
1. Loop over the object files in the S3 bucket and convert each one to a Parquet file; once a file is converted successfully, delete the original JSON file.
2. When writing the Parquet files, use mode="append" (see the read-back sketch below).
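Because each run appends new part files under the dt= partition directories, the output can later be read back as a single partitioned dataset. A minimal sketch, assuming the bucket/prefix from the script above and a sample dt value:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("read_parquet_alertlog").getOrCreate()

# Reading the base prefix lets Spark discover the dt= partition directories automatically.
df = spark.read.parquet("s3://kinesis/parquet_alertlog/")

# Filtering on the partition column prunes the scan to the matching dt directories.
# "2018-01-01" is only an example value.
df.filter(df.dt == "2018-01-01").show(10)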