    下面提到的所有实验结果，都是在配备 Intel i7-4770HQ CPU 的 2015 款 15 英寸 MacBook 上运行得到的。对于所有 x86 CPU，缓存行（cache line）大小应为 64 字节。

    Preparation and Baseline


    import tvm

    import tvm.testing

    from tvm import te

    import numpy

    import timeit


    # The size of the matrix
    # (M, K) x (K, N)
    # You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL.
    M = 1024
    K = 1024
    N = 1024

    # The default tensor type in tvm
    dtype = "float32"

    # using Intel AVX2 (Advanced Vector Extensions) ISA for SIMD
    # To get the best performance, please change the following line
    # to llvm -mcpu=core-avx2, or specific type of CPU you use
    target = "llvm"
    ctx = tvm.context(target, 0)

    # Random generated tensor for testing
    a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), ctx)
    b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), ctx)

    # Time numpy.dot on matrices of the same shape as a reference point.
    # The setup snippet is executed by timeit as its own program, so each
    # statement must sit on its own line (hence the explicit "\n" separators),
    # and `number` must match the divisor used below to get a per-call average.
    np_repeat = 100
    np_runing_time = timeit.timeit(
        setup="import numpy\n"
        "M = " + str(M) + "\n"
        "K = " + str(K) + "\n"
        "N = " + str(N) + "\n"
        'dtype = "float32"\n'
        "a = numpy.random.rand(M, K).astype(dtype)\n"
        "b = numpy.random.rand(K, N).astype(dtype)\n",
        stmt="answer = numpy.dot(a, b)",
        number=np_repeat,
    )
    print("Numpy running time: %f" % (np_runing_time / np_repeat))

    # Reference result used to validate every TVM schedule below.
    answer = numpy.dot(a.asnumpy(), b.asnumpy())


    # Algorithm: C[x, y] = sum over k of A[x, k] * B[k, y]

    # Reduction axis covering the shared dimension [0, K).
    k = te.reduce_axis((0, K), "k")

    # Symbolic inputs; the lowered IR printed further down shows them as float32 buffers.
    A = te.placeholder((M, K), name="A")

    B = te.placeholder((K, N), name="B")

    C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")


    # Default schedule: no transformations are applied to the loop nest of C.

    s = te.create_schedule(C.op)

    func = tvm.build(s, [A, B, C], target=target, name="mmult")

    assert func


    # Run once into a zero-initialised output and compare against the numpy
    # reference `answer` (relative tolerance 1e-5).
    c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)

    func(a, b, c)

    tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)


    # number=1 here versus number=10 for the optimized schedules below;
    # presumably because the untuned kernel is slow (see the Baseline figure
    # in the captured output) -- TODO confirm.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=1)

    print("Baseline: %f" % evaluator(a, b, c).mean)


    Numpy running time: 0.006963

    Baseline: 3.516655

    In TVM, we can always inspect lower level IR to debug or optimize our schedule. Here is the generated IR using our baseline schedule.

    # Print the lowered IR produced from the current schedule `s` for inspection.
    print(tvm.lower(s, [A, B, C], simple_mode=True))


    primfn(A_1: handle, B_1: handle, C_1: handle) -> ()

      attr = {"global_symbol": "main", "tir.noalias": True}

      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),

                 B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),

                 A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}

      buffer_map = {A_1: A, B_1: B, C_1: C} {

      for (x: int32, 0, 1024) {

        for (y: int32, 0, 1024) {

          C_2[((x*1024) + y)] = 0f32

          for (k: int32, 0, 1024) {

            C_2[((x*1024) + y)] = ((float32*)C_2[((x*1024) + y)] + ((float32*)A_2[((x*1024) + k)]*(float32*)B_2[((k*1024) + y)]))

          }

        }

      }

    }







    # Tile edge length: the output C is processed in bn x bn blocks.
    bn = 32

    s = te.create_schedule(C.op)


    # Blocking by loop tiling
    # xo/yo iterate over tiles, xi/yi iterate inside one bn x bn tile.

    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)

    (k,) = s[C].op.reduce_axis

    # Split the reduction axis by a factor of 4; the IR printed after this
    # snippet shows k.outer with extent 256 and k.inner with extent 4.
    ko, ki = s[C].split(k, factor=4)


    # Hoist reduction domain outside the blocking loop

    s[C].reorder(xo, yo, ko, ki, xi, yi)


    func = tvm.build(s, [A, B, C], target=target, name="mmult")

    assert func


    # Validate the tiled kernel against the numpy reference `answer`.
    c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)

    func(a, b, c)

    tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)


    # By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops,

    # we can see big speedup compared with the baseline.

    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)

    print("Opt1: %f" % evaluator(a, b, c).mean)


    Opt1: 0.284967

    Here is the generated IR after blocking.

    # Print the lowered IR produced from the tiled schedule `s` for inspection.
    print(tvm.lower(s, [A, B, C], simple_mode=True))


    primfn(A_1: handle, B_1: handle, C_1: handle) -> ()

      attr = {"global_symbol": "main", "tir.noalias": True}

      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),

                 B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),

                 A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}

      buffer_map = {A_1: A, B_1: B, C_1: C} {

      for (x.outer: int32, 0, 32) {

        for (y.outer: int32, 0, 32) {

          for (x.inner.init: int32, 0, 32) {

            for (y.inner.init: int32, 0, 32) {

              C_2[((((x.outer*32768) + (x.inner.init*1024)) + (y.outer*32)) + y.inner.init)] = 0f32

            }

          }

          for (k.outer: int32, 0, 256) {

            for (k.inner: int32, 0, 4) {

              for (x.inner: int32, 0, 32) {

                for (y.inner: int32, 0, 32) {

                  C_2[((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)) + y.inner)] = ((float32*)C_2[((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)) + y.inner)] + ((float32*)A_2[((((x.outer*32768) + (x.inner*1024)) + (k.outer*4)) + k.inner)]*(float32*)B_2[((((k.outer*4096) + (k.inner*1024)) + (y.outer*32)) + y.inner)]))

                }

              }

            }

          }

        }

      }

    }











    # Opt2: same 32x32 tiling and reduction split as the blocking step,
    # plus vectorization of the innermost spatial axis.
    s = te.create_schedule(C.op)
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    (k,) = s[C].op.reduce_axis
    ko, ki = s[C].split(k, factor=4)

    s[C].reorder(xo, yo, ko, ki, xi, yi)

    # Vectorization
    # yi walks consecutive elements of a row of B and C (unit stride in the
    # index expressions), so it is the axis to turn into vector operations.
    # Without this call the schedule is identical to the blocking-only one and
    # the lowered IR would still contain the scalar y.inner loop.
    s[C].vectorize(yi)

    func = tvm.build(s, [A, B, C], target=target, name="mmult")
    assert func

    # Validate the vectorized kernel against the numpy reference `answer`.
    c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
    func(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)

    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print("Opt2: %f" % evaluator(a, b, c).mean)


    Opt2: 0.321595

    Here is the generated IR after vectorization.

    # Print the lowered IR produced from the current schedule `s` for inspection.
    print(tvm.lower(s, [A, B, C], simple_mode=True))


    primfn(A_1: handle, B_1: handle, C_1: handle) -> ()

      attr = {"global_symbol": "main", "tir.noalias": True}

      buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),

                 B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),

                 A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}

      buffer_map = {A_1: A, B_1: B, C_1: C} {

      for (x.outer: int32, 0, 32) {

        for (y.outer: int32, 0, 32) {

          for (x.inner.init: int32, 0, 32) {

            C_2[ramp((((x.outer*32768) + (x.inner.init*1024)) + (y.outer*32)), 1, 32)] = broadcast(0f32, 32)

          }

          for (k.outer: int32, 0, 256) {

            for (k.inner: int32, 0, 4) {

              for (x.inner: int32, 0, 32) {

                C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] = ((float32x32*)C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] + (broadcast((float32*)A_2[((((x.outer*32768) + (x.inner*1024)) + (k.outer*4)) + k.inner)], 32)*(float32x32*)B_2[ramp((((k.outer*4096) + (k.inner*1024)) + (y.outer*32)), 1, 32)]))

              }

            }

          }

        }

      }

    }







    Loop Permutation


    # Opt3: tiling + reduction split as before, with xi and ki swapped so that
    # the reduction index is iterated inside the row index.
    s = te.create_schedule(C.op)
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    (k,) = s[C].op.reduce_axis
    ko, ki = s[C].split(k, factor=4)

    # re-ordering
    # With ki inside xi, consecutive inner iterations read A[x, k] at adjacent
    # k (unit stride) instead of jumping a full row of A per iteration.
    s[C].reorder(xo, yo, ko, xi, ki, yi)
    # Keep the innermost axis vectorized, as in the previous step; the lowered
    # IR for this schedule uses 32-wide ramp/broadcast operations.
    s[C].vectorize(yi)

    func = tvm.build(s, [A, B, C], target=target, name="mmult")
    assert func

    # Validate the permuted kernel against the numpy reference `answer`.
    c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
    func(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)

    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print("Opt3: %f" % evaluator(a, b, c).mean)


    Opt3: 0.111657

    Here is the generated IR after loop permutation.

    # Print the lowered IR produced from the current schedule `s` for inspection.
    print(tvm.lower(s, [A, B, C], simple_mode=True))


    primfn(A_1: handle, B_1: handle, C_1: handle) -> ()

      attr = {"global_symbol": "main", "tir.noalias": True}

      buffers = {B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], []),

                 C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),

                 A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], [])}

      buffer_map = {A_1: A, B_1: B, C_1: C} {

      for (x.outer: int32, 0, 32) {

        for (y.outer: int32, 0, 32) {

          for (x.inner.init: int32, 0, 32) {

            C_2[ramp((((x.outer*32768) + (x.inner.init*1024)) + (y.outer*32)), 1, 32)] = broadcast(0f32, 32)

          }

          for (k.outer: int32, 0, 256) {

            for (x.inner: int32, 0, 32) {

              for (k.inner: int32, 0, 4) {

                C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] = ((float32x32*)C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] + (broadcast((float32*)A_2[((((x.outer*32768) + (x.inner*1024)) + (k.outer*4)) + k.inner)], 32)*(float32x32*)B_2[ramp((((k.outer*4096) + (k.inner*1024)) + (y.outer*32)), 1, 32)]))

              }

            }

          }

        }

      }

    }








