# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from __future__ import absolute_import, print_function

import os
import tvm
import topi
import varec
import numpy as np

from tvm import rpc
from tvm.contrib import util
from varec.testing import simulator
from varec.top import varec_compute as cp
from varec.top import varec_debug as db
from varec.top import varec_schedule as sche

# Load varec parameters from the varec/config/varec_config.json file
env = varec.get_env()

# Open an RPC session; a LocalSession runs everything in the local process
remote = rpc.LocalSession()

######################################################################
# Computation Declaration
# -----------------------
# A large LeNet-5-like network (16 input channels)

##################################################
# Stage 0

stage0_batch_size = 1
stage0_height = 32
stage0_width = 32
stage0_in_channels = 16
stage0_out_channels = 16 * 6
stage0_kernel_h = 5
stage0_kernel_w = 5
stage0_pad_h = 0
stage0_pad_w = 0
stage0_stride_h = 1
stage0_stride_w = 1
assert stage0_batch_size % env.BATCH == 0
assert stage0_in_channels % env.BLOCK_IN == 0
assert stage0_out_channels % env.BLOCK_OUT == 0

# Input feature map: (N, IC, H, W, n, ic)
stage0_data_shape = (stage0_batch_size // env.BATCH,
                     stage0_in_channels // env.BLOCK_IN,
                     stage0_height,
                     stage0_width,
                     env.BATCH,
                     env.BLOCK_IN)
# Kernel: (OC, IC, H, W, oc, ic)
stage0_kernel_shape = (stage0_out_channels // env.BLOCK_OUT,
                       stage0_in_channels // env.BLOCK_IN,
                       stage0_kernel_h,
                       stage0_kernel_w,
                       env.BLOCK_OUT,
                       env.BLOCK_IN)
# Derive output feature map dimensions
stage0_fout_height = (stage0_height + 2 * stage0_pad_h - stage0_kernel_h) // stage0_stride_h + 1
stage0_fout_width = (stage0_width + 2 * stage0_pad_w - stage0_kernel_w) // stage0_stride_w + 1
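
# Sanity check: (32 + 2*0 - 5) // 1 + 1 = 28, and the 2x2 max-pool below
# halves that to the 14x14 input expected by stage 1
assert stage0_fout_height == 28 and stage0_fout_width == 28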

# Conv feature map: (N, OC, H, W, n, oc)
stage0_conv_shape = (stage0_batch_size // env.BATCH,
                     stage0_out_channels // env.BLOCK_OUT,
                     stage0_fout_height,
                     stage0_fout_width,
                     env.BATCH,
                     env.BLOCK_OUT)

# Output feature map after 2x2 pooling: (N, OC, H, W, n, oc)
stage0_output_shape = (stage0_batch_size // env.BATCH,
                       stage0_out_channels // env.BLOCK_OUT,
                       stage0_fout_height // 2,
                       stage0_fout_width // 2,
                       env.BATCH,
                       env.BLOCK_OUT)

# Convolution reduction axes
stage0_dy = tvm.reduce_axis((0, stage0_kernel_h), name='dy')
stage0_dx = tvm.reduce_axis((0, stage0_kernel_w), name='dx')
stage0_ic = tvm.reduce_axis((0, stage0_in_channels // env.BLOCK_IN), name='ic')
stage0_ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns')

inp_min = -(1 << (env.INP_WIDTH - 1))
inp_max = (1 << (env.INP_WIDTH - 1)) - 1
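# e.g. env.INP_WIDTH == 8 gives the int8 range inp_min = -128, inp_max = 127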

pool_x, pool_y = 2, 2
pool_type = "max"

# Parameter tuples consumed by the varec compute helpers
stage0_conv_paras = stage0_dy, stage0_dx, stage0_ic, stage0_ic_tns, stage0_stride_h, stage0_stride_w
stage0_data_paras = stage0_pad_h, stage0_pad_w
stage0_pool_paras = pool_type, pool_x, pool_y

# Generate the compute graph with the varec libraries

# Conv
stage0_data_buf, stage0_data, stage0_kernel_buf, stage0_kernel, stage0_conv = \
    cp.compute_conv(stage0_data_shape, stage0_kernel_shape, stage0_conv_shape,
                    stage0_conv_paras, stage0_data_paras, env)

# Shift and Clip (requantize the wide accumulator back to the input range)
stage0_sc = cp.compute_shr_clip(stage0_conv_shape, stage0_conv, inp_min, inp_max)
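
# For intuition: the shift-and-clip step is an arithmetic right shift followed
# by saturation, exactly as in the numpy reference used for verification below.
# A self-contained 8-bit example (the real shift amount is env.INP_WIDTH):
_acc = np.array([70000, -300, 12], dtype=np.int32)
assert list(np.clip(_acc >> 8, -128, 127)) == [127, -2, 0]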

# ReLU
stage0_relu = cp.relu(stage0_sc[-1])

# Pool
stage0_pools, stage0_output_buf, stage0_output_ph = \
    cp.compute_pool(stage0_output_shape, stage0_relu, stage0_pool_paras, env)

# Result tensor, cast back to the input dtype
stage0 = tvm.compute(stage0_output_shape,
                     lambda *i: stage0_pools[-1](*i).astype(env.inp_dtype),
                     name="stage0")


##################################################
# Stage 1

stage1_batch_size = 1
stage1_height = 14
stage1_width = 14
stage1_in_channels = 16 * 6
stage1_out_channels = 16 * 16
stage1_kernel_h = 5
stage1_kernel_w = 5
stage1_pad_h = 0
stage1_pad_w = 0
stage1_stride_h = 1
stage1_stride_w = 1
assert stage1_batch_size % env.BATCH == 0
assert stage1_in_channels % env.BLOCK_IN == 0
assert stage1_out_channels % env.BLOCK_OUT == 0

# Input feature map: (N, IC, H, W, n, ic)
stage1_data_shape = (stage1_batch_size // env.BATCH,
                     stage1_in_channels // env.BLOCK_IN,
                     stage1_height,
                     stage1_width,
                     env.BATCH,
                     env.BLOCK_IN)
# Kernel: (OC, IC, H, W, oc, ic)
stage1_kernel_shape = (stage1_out_channels // env.BLOCK_OUT,
                       stage1_in_channels // env.BLOCK_IN,
                       stage1_kernel_h,
                       stage1_kernel_w,
                       env.BLOCK_OUT,
                       env.BLOCK_IN)
# Derive output feature map dimensions
stage1_fout_height = (stage1_height + 2 * stage1_pad_h - stage1_kernel_h) // stage1_stride_h + 1
stage1_fout_width = (stage1_width + 2 * stage1_pad_w - stage1_kernel_w) // stage1_stride_w + 1
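
# Sanity check: (14 + 2*0 - 5) // 1 + 1 = 10, pooled down to the 5x5 input
# expected by stage 2
assert stage1_fout_height == 10 and stage1_fout_width == 10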

# Conv feature map: (N, OC, H, W, n, oc)
stage1_conv_shape = (stage1_batch_size // env.BATCH,
                     stage1_out_channels // env.BLOCK_OUT,
                     stage1_fout_height,
                     stage1_fout_width,
                     env.BATCH,
                     env.BLOCK_OUT)

# Output feature map after 2x2 pooling: (N, OC, H, W, n, oc)
stage1_output_shape = (stage1_batch_size // env.BATCH,
                       stage1_out_channels // env.BLOCK_OUT,
                       stage1_fout_height // 2,
                       stage1_fout_width // 2,
                       env.BATCH,
                       env.BLOCK_OUT)

# Convolution reduction axes
stage1_dy = tvm.reduce_axis((0, stage1_kernel_h), name='dy')
stage1_dx = tvm.reduce_axis((0, stage1_kernel_w), name='dx')
stage1_ic = tvm.reduce_axis((0, stage1_in_channels // env.BLOCK_IN), name='ic')
stage1_ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns')

# Parameter tuples consumed by the varec compute helpers
stage1_conv_paras = stage1_dy, stage1_dx, stage1_ic, stage1_ic_tns, stage1_stride_h, stage1_stride_w
stage1_data_paras = stage1_pad_h, stage1_pad_w
stage1_pool_paras = pool_type, pool_x, pool_y


# Generate the compute graph with the varec libraries

# Conv, chained onto stage0 so the stages form a single graph
stage1_data_buf, stage1_data, stage1_kernel_buf, stage1_kernel, stage1_conv = \
    cp.compute_conv(stage1_data_shape, stage1_kernel_shape, stage1_conv_shape,
                    stage1_conv_paras, stage1_data_paras, env, stage0)

# Shift and Clip
stage1_sc = cp.compute_shr_clip(stage1_conv_shape, stage1_conv, inp_min, inp_max)

# ReLU
stage1_relu = cp.relu(stage1_sc[-1])

# Pool
stage1_pools, stage1_output_buf, stage1_output_ph = \
    cp.compute_pool(stage1_output_shape, stage1_relu, stage1_pool_paras, env)

# Result tensor
stage1 = tvm.compute(stage1_output_shape,
                     lambda *i: stage1_pools[-1](*i).astype(env.inp_dtype),
                     name="stage1")


##################################################
# Stage 2

stage2_batch_size = 1
stage2_height = 5
stage2_width = 5
stage2_in_channels = 16 * 16
stage2_out_channels = 16 * 16
stage2_kernel_h = 5
stage2_kernel_w = 5
stage2_pad_h = 0
stage2_pad_w = 0
stage2_stride_h = 1
stage2_stride_w = 1
assert stage2_batch_size % env.BATCH == 0
assert stage2_in_channels % env.BLOCK_IN == 0
assert stage2_out_channels % env.BLOCK_OUT == 0

# Input feature map: (N, IC, H, W, n, ic)
stage2_data_shape = (stage2_batch_size // env.BATCH,
                     stage2_in_channels // env.BLOCK_IN,
                     stage2_height,
                     stage2_width,
                     env.BATCH,
                     env.BLOCK_IN)
# Kernel: (OC, IC, H, W, oc, ic)
stage2_kernel_shape = (stage2_out_channels // env.BLOCK_OUT,
                       stage2_in_channels // env.BLOCK_IN,
                       stage2_kernel_h,
                       stage2_kernel_w,
                       env.BLOCK_OUT,
                       env.BLOCK_IN)
# Derive output feature map dimensions
stage2_fout_height = (stage2_height + 2 * stage2_pad_h - stage2_kernel_h) // stage2_stride_h + 1
stage2_fout_width = (stage2_width + 2 * stage2_pad_w - stage2_kernel_w) // stage2_stride_w + 1
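
# Sanity check: (5 + 2*0 - 5) // 1 + 1 = 1, i.e. a 5x5 kernel over the 5x5
# input collapses to a 1x1 output, so no pooling stage follows
assert stage2_fout_height == 1 and stage2_fout_width == 1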

# Conv feature map: (N, OC, H, W, n, oc)
stage2_conv_shape = (stage2_batch_size // env.BATCH,
                     stage2_out_channels // env.BLOCK_OUT,
                     stage2_fout_height,
                     stage2_fout_width,
                     env.BATCH,
                     env.BLOCK_OUT)

# Output feature map: (N, OC, H, W, n, oc); identical to the conv shape
# because stage 2 has no pooling
stage2_output_shape = (stage2_batch_size // env.BATCH,
                       stage2_out_channels // env.BLOCK_OUT,
                       stage2_fout_height,
                       stage2_fout_width,
                       env.BATCH,
                       env.BLOCK_OUT)

# Convolution reduction axes
stage2_dy = tvm.reduce_axis((0, stage2_kernel_h), name='dy')
stage2_dx = tvm.reduce_axis((0, stage2_kernel_w), name='dx')
stage2_ic = tvm.reduce_axis((0, stage2_in_channels // env.BLOCK_IN), name='ic')
stage2_ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns')

# Parameter tuples; no pool parameters, since stage 2 has no pooling
stage2_conv_paras = stage2_dy, stage2_dx, stage2_ic, stage2_ic_tns, stage2_stride_h, stage2_stride_w
stage2_data_paras = stage2_pad_h, stage2_pad_w


# Generate the compute graph with the varec libraries

# Conv, chained onto stage1
stage2_data_buf, stage2_data, stage2_kernel_buf, stage2_kernel, stage2_conv = \
    cp.compute_conv(stage2_data_shape, stage2_kernel_shape, stage2_conv_shape,
                    stage2_conv_paras, stage2_data_paras, env, stage1)

# Shift and Clip
stage2_sc = cp.compute_shr_clip(stage2_conv_shape, stage2_conv, inp_min, inp_max)

# ReLU
stage2_relu = cp.relu(stage2_sc[-1])

# Result tensor
stage2 = tvm.compute(stage2_output_shape,
                     lambda *i: stage2_relu(*i).astype(env.inp_dtype),
                     name="stage2")


######################################################################
# Scheduling the Computation
# --------------------------

# Create the TVM schedule
s = tvm.create_schedule(stage2.op)
# Let's look at the default TVM schedule
print(tvm.lower(s, [stage0_data, stage0_kernel, stage0_output_ph,
                    stage1_kernel, stage1_output_ph, stage2_kernel, stage2], simple_mode=True))
print("+++++++++++++++++")
######################################################################
# Blocking the Computation: Stage 0

# A feasible set of tiling sizes, kept for reference:
# b_block = 1 // env.BATCH
# oc_block = 16 * 2 // env.BLOCK_OUT
# ic_block = 16 // env.BLOCK_IN
# h_block = 7
# w_block = 14

# Derive the tiling sizes automatically
wkld = sche.get_workload(stage0_data_shape, stage0_kernel_shape, stage0_output_shape)
schedules = sche.find_schedules(wkld, env, stage0_output_shape, pool_combined=True, best_only=True)
b_block, oc_block, ic_block, h_block, w_block = schedules[0].get_sche()

# Target stages and tiling parameters
stages = stage0, stage0_conv, stage0_sc, stage0_pools, \
    stage0_relu, stage0_output_buf, stage0_data_buf, stage0_kernel_buf
params = b_block, oc_block, h_block, w_block, stage0_ic, \
    ic_block, stage0_dy, stage0_dx, stage0_ic_tns
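
# Conceptually, sche.tile splits and reorders each stage's axes much like a
# hand-written TVM schedule would; a rough sketch of the idea only (the axis
# names below are illustrative, not the helper's actual implementation):
#     b, oc, h, w, b_tns, oc_tns = s[stage0].op.axis
#     ho, hi = s[stage0].split(h, factor=h_block)
#     wo, wi = s[stage0].split(w, factor=w_block)
#     s[stage0].reorder(b, oc, ho, wo, hi, wi, b_tns, oc_tns)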

# Tile stages with params
axis = sche.tile(s, stages, params)
print(tvm.lower(s, [stage0_data, stage0_kernel, stage0_output_ph,
                    stage1_kernel, stage1_output_ph, stage2_kernel, stage2], simple_mode=True))
print("+++++++++++++++++")

# Lower the stages, using the returned axes (ic_out, b_tns)
sche.lowering(s, stages, axis, stage_ID=0, env=env)

######################################################################
# Blocking the Computation: Stage 1

# A feasible set of tiling sizes, kept for reference:
# b_block = 1 // env.BATCH
# oc_block = 16 * 16 // env.BLOCK_OUT // 8
# ic_block = 16 * 6 // env.BLOCK_IN // 2
# h_block = 5
# w_block = 1

# Derive the tiling sizes automatically
wkld = sche.get_workload(stage1_data_shape, stage1_kernel_shape, stage1_output_shape)
schedules = sche.find_schedules(wkld, env, stage1_output_shape, pool_combined=True, best_only=True)
b_block, oc_block, ic_block, h_block, w_block = schedules[0].get_sche()

# Target stages and tiling parameters
stages = stage1, stage1_conv, stage1_sc, stage1_pools, \
    stage1_relu, stage1_output_buf, stage1_data_buf, stage1_kernel_buf
params = b_block, oc_block, h_block, w_block, stage1_ic, \
    ic_block, stage1_dy, stage1_dx, stage1_ic_tns

# Tile stages with params
axis = sche.tile(s, stages, params)
print(tvm.lower(s, [stage0_data, stage0_kernel, stage0_output_ph,
                    stage1_kernel, stage1_output_ph, stage2_kernel, stage2], simple_mode=True))
print("+++++++++++++++++")

# Lower the stages, using the returned axes (ic_out, b_tns)
sche.lowering(s, stages, axis, stage_ID=1, env=env)

######################################################################
# Blocking the Computation: Stage 2

# Define the tiling sizes manually here, because the output height and width are 1
b_block = 1 // env.BATCH
oc_block = 16 * 16 // env.BLOCK_OUT // 8
ic_block = 16 * 16 // env.BLOCK_IN // 4
h_block = 1
w_block = 1
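
# Under the common configuration env.BATCH == 1 and env.BLOCK_OUT ==
# env.BLOCK_IN == 16 (an assumption; check varec/config/varec_config.json),
# these evaluate to b_block = 1, oc_block = 2, ic_block = 4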

# Target stages and tiling parameters; the empty tuples stand in for the
# pooling stages, which stage 2 does not have
stages = stage2, stage2_conv, stage2_sc, (), \
    stage2_relu, (), stage2_data_buf, stage2_kernel_buf
params = b_block, oc_block, h_block, w_block, stage2_ic, \
    ic_block, stage2_dy, stage2_dx, stage2_ic_tns

# Tile stages with params
axis = sche.tile(s, stages, params)
print(tvm.lower(s, [stage0_data, stage0_kernel, stage0_output_ph,
                    stage1_kernel, stage1_output_ph, stage2_kernel, stage2], simple_mode=True))
print("+++++++++++++++++")

# Lower the stages, using the returned axes (ic_out, b_tns)
sche.lowering(s, stages, axis, stage_ID=2, env=env)

# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# varec compute intrinsics.
print(varec.lower(s, [stage0_data, stage0_kernel, stage0_output_ph,
                      stage1_kernel, stage1_output_ph, stage2_kernel, stage2], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------

# This helper provides a reference 2D convolution for verification
from topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = varec.build(s, [stage0_data, stage0_kernel, stage0_output_ph,
                          stage1_kernel, stage1_output_ph, stage2_kernel, stage2],
                      "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
my_conv.save(temp.relpath("Lenet-5.o"))
remote.upload(temp.relpath("Lenet-5.o"))
f = remote.load_module("Lenet-5.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays with random integers drawn from
# [-128, 128) in NCHW layout
stage0_data_np = np.random.randint(
    -128, 128,
    size=(stage0_batch_size, stage0_in_channels, stage0_height, stage0_width)).astype(stage0_data.dtype)
stage0_kernel_np = np.random.randint(
    -128, 128,
    size=(stage0_out_channels, stage0_in_channels, stage0_kernel_h, stage0_kernel_w)).astype(stage0_kernel.dtype)
stage1_kernel_np = np.random.randint(
    -128, 128,
    size=(stage1_out_channels, stage1_in_channels, stage1_kernel_h, stage1_kernel_w)).astype(stage1_kernel.dtype)
stage2_kernel_np = np.random.randint(
    -128, 128,
    size=(stage2_out_channels, stage2_in_channels, stage2_kernel_h, stage2_kernel_w)).astype(stage2_kernel.dtype)
# Intermediate output buffers, initialized to inp_min
stage0_output_np = np.full(stage0_output_shape, inp_min).astype(stage0_output_ph.dtype)
stage1_output_np = np.full(stage1_output_shape, inp_min).astype(stage1_output_ph.dtype)

# Pack the data and kernel arrays from the 4D NCHW layout
# into the 6D NCHWnc packed layout
stage0_data_packed = stage0_data_np.reshape(stage0_batch_size // env.BATCH,
                                            env.BATCH,
                                            stage0_in_channels // env.BLOCK_IN,
                                            env.BLOCK_IN,
                                            stage0_height,
                                            stage0_width).transpose((0, 2, 4, 5, 1, 3))
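
# The packing is a pure relayout: inverting the transpose and reshape
# recovers the original NCHW array (the same inverse is applied to the
# reference outputs during verification below)
assert np.array_equal(
    stage0_data_packed.transpose((0, 4, 1, 5, 2, 3)).reshape(
        stage0_batch_size, stage0_in_channels, stage0_height, stage0_width),
    stage0_data_np)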

stage0_kernel_packed = stage0_kernel_np.reshape(stage0_out_channels // env.BLOCK_OUT,
                                                env.BLOCK_OUT,
                                                stage0_in_channels // env.BLOCK_IN,
                                                env.BLOCK_IN,
                                                stage0_kernel_h,
                                                stage0_kernel_w).transpose((0, 2, 4, 5, 1, 3))

stage1_kernel_packed = stage1_kernel_np.reshape(stage1_out_channels // env.BLOCK_OUT,
                                                env.BLOCK_OUT,
                                                stage1_in_channels // env.BLOCK_IN,
                                                env.BLOCK_IN,
                                                stage1_kernel_h,
                                                stage1_kernel_w).transpose((0, 2, 4, 5, 1, 3))

stage2_kernel_packed = stage2_kernel_np.reshape(stage2_out_channels // env.BLOCK_OUT,
                                                env.BLOCK_OUT,
                                                stage2_in_channels // env.BLOCK_IN,
                                                env.BLOCK_IN,
                                                stage2_kernel_h,
                                                stage2_kernel_w).transpose((0, 2, 4, 5, 1, 3))

# Format the input/output arrays with tvm.nd.array to the DLPack standard
stage0_data_nd = tvm.nd.array(stage0_data_packed, ctx)
stage0_kernel_nd = tvm.nd.array(stage0_kernel_packed, ctx)
stage1_kernel_nd = tvm.nd.array(stage1_kernel_packed, ctx)
stage2_kernel_nd = tvm.nd.array(stage2_kernel_packed, ctx)
stage0_output_nd = tvm.nd.array(stage0_output_np, ctx)
stage1_output_nd = tvm.nd.array(stage1_output_np, ctx)
stage2_nd = tvm.nd.array(np.zeros(stage2_output_shape).astype(stage2.dtype), ctx)

# Invoke the module to perform the computation
f(stage0_data_nd, stage0_kernel_nd, stage0_output_nd,
  stage1_kernel_nd, stage1_output_nd, stage2_kernel_nd, stage2_nd)
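
# Optionally, time the end-to-end run with TVM's time_evaluator. This is a
# sketch: timings from a local (simulated) RPC session are a smoke test, not
# a hardware measurement.
evaluator = f.time_evaluator(f.entry_name, ctx, number=1)
t = evaluator(stage0_data_nd, stage0_kernel_nd, stage0_output_nd,
              stage1_kernel_nd, stage1_output_nd, stage2_kernel_nd, stage2_nd)
print("Execution time: %g ms" % (t.mean * 1e3))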


# Verify against a numpy reference implementation
stage0_C1S2 = conv2d_nchw_python(stage0_data_np.astype(env.acc_dtype),
                                 stage0_kernel_np.astype(env.acc_dtype),
                                 (stage0_stride_h, stage0_stride_w),
                                 (stage0_pad_h, stage0_pad_w)).astype(env.acc_dtype)
# Shift and clip, mirroring compute_shr_clip
stage0_C1S2 = stage0_C1S2 >> env.INP_WIDTH
stage0_C1S2 = np.clip(stage0_C1S2, inp_min, inp_max)
stage0_C1S2 = stage0_C1S2.astype(stage0.dtype)
# Pack into the NCHWnc layout
stage0_C1S2 = stage0_C1S2.reshape((stage0_batch_size // env.BATCH,
                                   env.BATCH,
                                   stage0_out_channels // env.BLOCK_OUT,
                                   env.BLOCK_OUT,
                                   stage0_fout_height,
                                   stage0_fout_width)).transpose((0, 2, 4, 5, 1, 3))

params = stage0_output_shape, stage0_batch_size, env, stage0_out_channels, \
    stage0_fout_width, stage0_fout_height, stage0.dtype

# Reference pooling
stage0_ref = db.pool(stage0_C1S2, params)

# Unpack the stage 0 reference back to NCHW as the stage 1 input
stage1_data_np = stage0_ref.transpose((0, 4, 1, 5, 2, 3)) \
    .reshape(stage1_batch_size, stage1_in_channels, stage1_height, stage1_width)

stage1_C3S4 = conv2d_nchw_python(stage1_data_np.astype(env.acc_dtype),
                                 stage1_kernel_np.astype(env.acc_dtype),
                                 (stage1_stride_h, stage1_stride_w),
                                 (stage1_pad_h, stage1_pad_w)).astype(env.acc_dtype)
stage1_C3S4 = stage1_C3S4 >> env.INP_WIDTH
stage1_C3S4 = np.clip(stage1_C3S4, inp_min, inp_max)
stage1_C3S4 = stage1_C3S4.astype(stage1.dtype)
stage1_C3S4 = stage1_C3S4.reshape((stage1_batch_size // env.BATCH,
                                   env.BATCH,
                                   stage1_out_channels // env.BLOCK_OUT,
                                   env.BLOCK_OUT,
                                   stage1_fout_height,
                                   stage1_fout_width)).transpose((0, 2, 4, 5, 1, 3))

params = stage1_output_shape, stage1_batch_size, env, stage1_out_channels, \
    stage1_fout_width, stage1_fout_height, stage1.dtype

stage1_ref = db.pool(stage1_C3S4, params)

stage2_data_np = stage1_ref.transpose((0, 4, 1, 5, 2, 3)) \
    .reshape(stage2_batch_size, stage2_in_channels, stage2_height, stage2_width)

stage2_C5 = conv2d_nchw_python(stage2_data_np.astype(env.acc_dtype),
                               stage2_kernel_np.astype(env.acc_dtype),
                               (stage2_stride_h, stage2_stride_w),
                               (stage2_pad_h, stage2_pad_w)).astype(env.acc_dtype)
stage2_C5 = stage2_C5 >> env.INP_WIDTH
stage2_C5 = np.clip(stage2_C5, inp_min, inp_max)
stage2_C5 = stage2_C5.astype(stage2.dtype)
stage2_C5 = stage2_C5.reshape((stage2_batch_size // env.BATCH,
                               env.BATCH,
                               stage2_out_channels // env.BLOCK_OUT,
                               env.BLOCK_OUT,
                               stage2_fout_height,
                               stage2_fout_width)).transpose((0, 2, 4, 5, 1, 3))
# Apply ReLU (stage 2 has no pooling, so it comes last in the reference)
stage2_C5 = np.clip(stage2_C5, 0, inp_max)

tvm.testing.assert_allclose(stage2_C5, stage2_nd.asnumpy())

print("Successful LeNet-5 test!")