• 从GoogleClusterData统计每个用户的使用率、平均每次出价


    之前已经将 Google cluster data 导入了 Azure 上的 MySQL 数据库,下一步就是对这些数据进行分析,挖掘用户的使用规律。

    首先,为了加快查询速度,给查询条件中用到的 user、time 等字段加上索引。

    然后就可以使用以下代码进行统计了。

    import os
    import MySQLdb
    import time
    import thread
    
    # Task event codes compared against task_events.event_type below.
    # NOTE(review): in the Google cluster-data v2 trace format these codes are
    # 1 = SCHEDULE, 4 = FINISH, 5 = KILL -- confirm the import kept that coding.
    _TASK_START_EVENT = 1
    _TASK_END_EVENTS = (4, 5)

    # Width of one "day" window in the units of the `time` columns.  Lifetimes
    # are divided by 10**6 below, so timestamps are presumably microseconds.
    _TIME_UNITS_PER_DAY = 24 * 60 * 60 * 1000000


    def _sumTaskUsage(tasks):
        """Return (cpu, mem, disk) usage summed over one job's task events.

        ``tasks`` is a sequence of rows laid out as
        (task_index, event_type, cpu_request, memory_request,
        disk_space_request, time), sorted by task_index and then time.
        A start event immediately followed by an end event of the same task
        contributes ``request * lifetime``, the lifetime being the time
        difference divided by 10**6.
        """
        cpuUse = 0
        memUse = 0
        diskUse = 0
        # Walk adjacent pairs of rows; the last row has no successor and can
        # therefore never open a counted interval.
        for event, nextEvent in zip(tasks, tasks[1:]):
            if event[1] != _TASK_START_EVENT:
                continue
            if nextEvent[1] not in _TASK_END_EVENTS or nextEvent[0] != event[0]:
                continue
            taskLife = (nextEvent[5] - event[5]) / (10.0 ** 6)
            cpuUse += taskLife * event[2]
            memUse += taskLife * event[3]
            diskUse += taskLife * event[4]
        return cpuUse, memUse, diskUse


    def use4ADay(day, users):
        """Append each user's CPU/memory/disk usage for one day to that day's file.

        Parameters:
            day   -- 1-based day number; the queried window is
                     [(day - 1) * one day, day * one day) on the ``time`` column.
            users -- rows as returned by ``cursor.fetchall()``; element 0 of each
                     row is the user name.

        For every user one tab-separated line ``user, cpu, mem, disk`` is
        appended to the per-day text file.  The user currently being processed
        is published as ``use4ADay.user`` so that a caller can report which
        user was in flight when an exception escaped.

        Known limitation: both queries are restricted to the day window, so a
        task whose start and end events fall on different days is not counted.

        Any database or file error propagates to the caller; the connection is
        closed in all cases.
        """
        # NOTE(review): credentials are hard-coded; move them to configuration.
        conn = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                               db="googleclusterdata", charset="utf8")
        try:
            cursor = conn.cursor()
            startTime = (day - 1) * _TIME_UNITS_PER_DAY
            endTime = day * _TIME_UNITS_PER_DAY

            # Values are passed as query parameters (MySQLdb uses %s
            # placeholders for every type) instead of being formatted into the
            # SQL text, so odd characters in a user name cannot break the query.
            jobQuery = ("select job_id from job_events "
                        "where time >= %s and time < %s and user = %s")
            # Adjacent rows are paired as start/end events, so rows of one task
            # must be in chronological order: sort by time within task_index.
            taskQuery = ("select task_index, event_type, cpu_request, "
                         "memory_request, disk_space_request, time "
                         "from task_events "
                         "where time >= %s and time < %s and job_id = %s "
                         "order by task_index, time")

            for row in users:
                user = row[0]
                use4ADay.user = user
                print('day %s user %s' % (day, user))

                dayCPUUse = 0
                dayMEMUse = 0
                dayDiskUse = 0
                cursor.execute(jobQuery, (startTime, endTime, user))
                for (job_id,) in cursor.fetchall():
                    cursor.execute(taskQuery, (startTime, endTime, job_id))
                    cpuUse, memUse, diskUse = _sumTaskUsage(cursor.fetchall())
                    dayCPUUse += cpuUse
                    dayMEMUse += memUse
                    dayDiskUse += diskUse

                # The file is reopened in append mode for every user so that
                # finished users are already on disk if a later one fails.
                # Raw string: the backslashes are path separators, not escapes.
                with open(r'C:\userUsageEachDay\day%d.txt' % day, 'a') as fOut:
                    fOut.write('%s\t%f\t%f\t%f\n'
                               % (user, dayCPUUse, dayMEMUse, dayDiskUse))
            print('day %d finish' % day)
        finally:
            conn.close()
    
        
    # Fetch every distinct user name once; each per-day run reuses this list.
    # NOTE(review): credentials are hard-coded; move them to configuration.
    conn = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                           db="googleclusterdata", charset="utf8")
    try:
        cursor = conn.cursor()
        order = "select distinct user from job_events"
        print(order)
        cursor.execute(order)
        users = cursor.fetchall()
    finally:
        conn.close()

    # Days 1..29 are processed one after another.  A failing day is logged and
    # skipped so that the remaining days are still computed.
    for day in range(1, 30):
        # Reset the progress marker so a failure that happens before the first
        # user is reached is not attributed to a user of the previous day.
        use4ADay.user = ''
        try:
            use4ADay(day, users)
        except Exception as err:
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit stop
            # the run instead of being recorded as a failed day.
            print('day %s failed!! (%s)' % (day, err))
            # Raw string: in a normal literal '\f' would be a form feed
            # character, not a backslash followed by 'f'.
            with open(r'C:\failed.txt', 'a') as fOut:
                fOut.write('%s\t%d\t\n' % (use4ADay.user, day))

    下一步,是统计每个用户整个月(29 天)的消费频率(有消费的天数 / 29),以及每次消费的平均消费量(总消费 / 有消费的天数)。

    # Number of per-day usage files (day1.txt .. day29.txt).
    NUM_DAYS = 29

    # Weights that turn one day's cpu / mem / disk usage into a single cost.
    # NOTE(review): values are taken as-is from the original script; their
    # unit and origin are not documented here.
    CPU_PRICE = 1.92
    MEM_PRICE = 15.6
    DISK_PRICE = 1.2

    # NOTE(review): the per-day script writes its files under
    # C:\userUsageEachDay\ while this script reads C:\Usage\ -- confirm that the
    # files were moved or renamed between the two steps.
    DAY_FILE = r'C:\Usage\day%d.txt'


    def _readDayUsage(path):
        """Return {user: fields} for one tab-separated per-day usage file.

        ``fields`` is the stripped line split on tabs (user, cpu, mem, disk).
        If a user occurs more than once, the first line wins.  Blank lines are
        skipped.
        """
        rows = {}
        with open(path) as f:
            for line in f:
                fields = line.strip().split('\t')
                # Exact match on the user column; a substring test could
                # confuse one user name with another that contains it.
                if fields[0] and fields[0] not in rows:
                    rows[fields[0]] = fields
        return rows


    # The user list (and the output order) comes from the first day's file.
    with open(DAY_FILE % 1) as fDay1:
        users = [l.split('\t')[0] for l in fDay1 if l.strip()]

    # Read every day file once instead of once per user.
    dayRows = [_readDayUsage(DAY_FILE % day) for day in range(1, NUM_DAYS + 1)]

    # One line per user with at least one non-zero-cost day:
    #   <fraction of days with cost> TAB <average cost per such day>
    with open(r'C:\UseTraceOfAllUsers.txt', 'w') as fOut:
        for user in users:
            useDays = 0
            allPrice = 0
            for rows in dayRows:
                fields = rows.get(user)
                if fields is None:
                    continue
                cpu = float(fields[1])
                mem = float(fields[2])
                disk = float(fields[3])
                money = CPU_PRICE*cpu + MEM_PRICE*mem + DISK_PRICE*disk
                if money < 0:
                    # Raised explicitly because `assert` is stripped under -O.
                    raise ValueError('negative cost %r for user %r'
                                     % (money, user))
                if money != 0:
                    useDays += 1
                    allPrice += money
            if useDays != 0:
                fOut.write('%s\t%s\n' % (str(useDays / float(NUM_DAYS)),
                                         str(allPrice / useDays)))

    最后就可以使用matlab进行画图啦。

    % Scatter plot of the per-user statistics file.
    % Column 1: fraction of days with non-zero cost; column 2: average cost
    % per such day (as written by the statistics script).
    % The path needs the backslash after the drive letter: 'C:name' is
    % relative to the current folder on drive C, not to the root.
    x = load('C:\UseTraceOfAllUsers.txt');   % ';' keeps the matrix from being echoed
    plot(x(:,1), x(:,2), 'o');

    结果如下:

    如果对平均消费量取对数,则可以这样画图:

    % Same scatter plot with the natural logarithm of column 2 (average cost
    % per day with cost) on the y axis.
    % The path needs the backslash after the drive letter: 'C:name' is
    % relative to the current folder on drive C, not to the root.
    x = load('C:\UseTraceOfAllUsers.txt');   % ';' keeps the matrix from being echoed
    plot(x(:,1), log(x(:,2)), 'o');

  • 相关阅读:
    Linux设备树(四 中断)
    Linux设备树(三 属性)
    Linux设备树(二 节点)
    责任链设计模式
    获取服务器内存和可用线程
    秒杀抢购思路解析
    Hystrix 用法及注解用法
    object is not an instance of declaring class
    sqlserver 截取字符串
    sqlserver 转化函数
  • 原文地址:https://www.cnblogs.com/instant7/p/4189238.html
Copyright © 2020-2023  润新知