|
- /**
- * Copyright (C) 2019-2021 Xilinx, Inc
- *
- * Licensed under the Apache License, Version 2.0 (the "License"). You may
- * not use this file except in compliance with the License. A copy of the
- * License is located at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- */
-
- /********************************************************************************************
- Description:
- This is a HBM bandwidth example using a pseudo random 1024 bit data access
- pattern to mimic Ethereum Ethash workloads.
- The design contains 3 compute units of a kernel, reading 1024 bits from a
- pseudo random address in each of 2 pseudo channels and writing the results of
- a simple mathematical operation to a pseudo random address in 2 other pseudo
- channels.
- To maximize bandwidth the pseudo channels are used in P2P like
- configuration.
- The host application allocates buffers in 12 HBM banks and runs the compute
- units concurrently to measure the overall bandwidth between kernel and HBM
- Memory.
- ******************************************************************************************/
-
- #include <algorithm>
- #include <iostream>
- #include <stdint.h>
- #include <stdlib.h>
- #include <string.h>
- #include <vector>
- #include<stdio.h>
- #include <queue>
- #include <unordered_map>
- #include <fstream>
- #include <sstream>
-
- #include "xcl2.hpp"
-
- #define NUM_CU 16
- #define storeNumToge 8
- // HBM Pseudo-Channel(PC) requirements
- #define MAX_HBM_PC_COUNT 32
- #define PC_NAME(n) n | XCL_MEM_TOPOLOGY
- const int pc[MAX_HBM_PC_COUNT] = {
- PC_NAME(0), PC_NAME(1), PC_NAME(2), PC_NAME(3), PC_NAME(4), PC_NAME(5), PC_NAME(6), PC_NAME(7),
- PC_NAME(8), PC_NAME(9), PC_NAME(10), PC_NAME(11), PC_NAME(12), PC_NAME(13), PC_NAME(14), PC_NAME(15),
- PC_NAME(16), PC_NAME(17), PC_NAME(18), PC_NAME(19), PC_NAME(20), PC_NAME(21), PC_NAME(22), PC_NAME(23),
- PC_NAME(24), PC_NAME(25), PC_NAME(26), PC_NAME(27), PC_NAME(28), PC_NAME(29), PC_NAME(30), PC_NAME(31)};
-
- std::vector<std::vector<uint32_t>> adj_matrix;
- // std::vector<uint32_t, aligned_allocator<uint32_t> > edges(dataSize);
- // std::vector<uint32_t, aligned_allocator<uint32_t> > offset(dataSize);
- std::vector<uint32_t, aligned_allocator<uint32_t> > edges;
- std::vector<uint32_t, aligned_allocator<uint32_t> > offset;
- std::vector<uint32_t, aligned_allocator<uint32_t> > EOmerges;
- // Function for verifying results
- bool verify(
- std::vector<uint32_t, aligned_allocator<uint32_t> >& source_hw_2hop_results,
- unsigned int size,
- unsigned int node_number,
- unsigned int edge_number
- ) {
- bool check = true;
- // for (size_t i = 0; i < size; i++) {
- // i[i] != source_sw_add_results[i]) {
- // std::cout << "Error: Result mismatch in Addition Operation" << std::endl;
- // std::cout << "i = " << i << " CPU result = " << source_sw_add_results[i]
- // << " Device result = " [i] << std::endl;
- // check = false;
- // break;
- // }
- // if (source_hw_mul_results[i] != source_sw_mul_results[i]) {
- // std::cout << "Error: Result mismatch in Multiplication Operation" << std::endl;
- // std::cout << "i = " << i << " CPU result = " << source_sw_mul_results[i]
- // << " Device result = " << source_hw_mul_results[i] << std::endl;
- // check = false;
- // break;
- // }
- // }
- return check;
- }
-
/**
 * Parse the leading unsigned decimal number of a string.
 *
 * Formatted stream extraction is used, so leading whitespace is skipped and
 * parsing stops at the first character that cannot belong to the number.
 *
 * @param str Text to parse.
 * @return The parsed value, or 0 when str holds no number (empty,
 *         whitespace-only or non-numeric text).
 */
uint32_t s_to_uint32_t(const std::string& str)
{
    // Start from a defined value: for empty or whitespace-only input the
    // extraction below stores nothing, so an uninitialised variable would be
    // returned with an indeterminate value.
    uint32_t result = 0;
    std::istringstream is(str);
    is >> result;
    return result;
}
-
- void graph_adj_matrix(std::string filename)
- {
- std::ifstream infile(filename);
- std::istringstream iss;
- std::string buff;
- std::unordered_map<uint32_t, int32_t> src_map;
- uint32_t src_id, dst_id;
- uint32_t src_index;
- if(!infile.is_open()){
- std::cout<<"unable to open "<<filename<<"\n";
- }
-
- int edge_count = 0;
- while (getline(infile, buff)){
- // iss.clear();
- // iss.str(buff);
- // iss >> src_id;
- // iss >> dst_id;
-
- std::istringstream sin(buff);
- std::vector<std::string> fields;
- std::string field;
-
- while (getline(sin, field, ','))
- {
- fields.push_back(field);
-
- }
- src_id=s_to_uint32_t(fields[0]);
- dst_id=s_to_uint32_t(fields[1]);
- //***********************dst id debug
- if (src_id==0)
- std::cout<<"dst id"<<dst_id<<"\n";
-
-
-
- //*********************ignore edge weight
- // iss >> edge_weight;
- // if(src_id >= batch_size || dst_id >= batch_size){
- // continue;
- // }
- auto src_map_size = src_map.size();
- src_map.insert({src_id, src_map_size});
- auto it = src_map.find(src_id);
- if(it == src_map.end()){
- src_index = -1;
- }else{
- src_index = it->second;
- }
- // src_index starts from 0 or 1?
- if(src_index < adj_matrix.size()){
- adj_matrix[src_index].emplace_back(dst_id);
- edge_count++;
- }else{
- std::vector<uint32_t> new_neighbor(1, dst_id);
- adj_matrix.push_back(std::move(new_neighbor));
- edge_count++;
- }
- }
- std::cout<<"edge num "<<edge_count<<"\n";
-
-
- }
-
-
- void graph_offset()
- {
- //Get CSR from adj_matrix
- ;
- uint32_t ost=0;
- //debug:March 24
- offset.emplace_back(ost);
- EOmerges.emplace_back(ost);
- for(uint32_t i=0;i<adj_matrix.size();i++)
- {
- ost+=adj_matrix[i].size();
- offset.emplace_back(ost);
- //fill the EOmerges
- EOmerges.emplace_back(ost);
-
- }
-
- //print offset and value
- std::cout<<"size of nodes in offset"<<ost<<" \n";
- //debug
- std::cout<<"size 0"<<" ";
- std::cout<<adj_matrix[0].size()<<"\n";
- std::cout<<"offset 1 - offset 0: "<<(offset[1]-offset[0])<<"\n";
- std::cout<<"size 1"<<" ";
- std::cout<<adj_matrix[1].size()<<"\n";
- std::cout<<"offset 2 - offset 1: "<<(offset[2]-offset[1])<<"\n";
-
- // std::cout<<"adj matrix size:"<<adj_matrix.size()<<"\n";
- // std::vector<uint32_t>::iterator iter;
- // for (iter=adj_matrix[1].begin();iter!=adj_matrix[1].end();iter++)
- // std::cout<<*iter<<" ";
- }
-
- void graph_edge()
- {
- for (uint32_t i=0;i<adj_matrix.size();i++)
- for (uint32_t j=0;j<adj_matrix[i].size();j++)
- {
- edges.emplace_back(adj_matrix[i][j]);
- EOmerges.emplace_back(adj_matrix[i][j]);
- }
- }
-
- // std::vector<cl_mem_ext_ptr_t> P0(NUM_CU);
- // std::vector<cl_mem_ext_ptr_t> P1(NUM_CU);
- // std::vector<cl_mem_ext_ptr_t> P2(NUM_CU);
- // std::vector<cl_mem_ext_ptr_t> P3(NUM_CU);
-
- // std::vector<cl::Buffer> buffer_p0(NUM_CU);
- // std::vector<cl::Buffer> buffer_p1(NUM_CU);
- // std::vector<cl::Buffer> buffer_p2(NUM_CU);
- // std::vector<cl::Buffer> buffer_p3(NUM_CU);
-
- // std::vector<uint32_t, aligned_allocator<uint32_t> > source_hw_1hop_results[NUM_CU];
- // std::vector<uint32_t, aligned_allocator<uint32_t> > source_hw_2hop_results[NUM_CU];
-
- int main(int argc, char* argv[]) {
- if (argc != 2) {
- std::cout << "Usage: " << argv[0] << "<XCLBIN> \n";
- return -1;
- }
-
-
- unsigned int dataSize = 64 * 1024 * 1024; // taking maximum possible data size value for an HBM bank
- // unsigned int num_times = 1024; // num_times specify, number of times a kernel 1024
- // unsigned int num_times = 64; // will execute the same operation. This is
- // needed
- unsigned int num_times = 1;
- // to keep the kernel busy to test the actual bandwidth of all banks running
- // concurrently.
- unsigned int my_node_number=800*1024;
- unsigned int my_CU_num=16;
- unsigned int Hop2_offset=0;
- // reducing the test data capacity to run faster in emulation mode
- if (xcl::is_emulation()) {
- // dataSize = 1024;
- dataSize = 1024*1024;
- num_times = 1;
- my_node_number=8*8;
- }
-
- std::string binaryFile = argv[1];
- cl_int err;
- cl::CommandQueue q;
- std::string krnl_name = "krnl_graph";
- std::string krnl_name_read = "krnl_graph_read";
- std::string krnl_name_write = "krnl_graph_write";
-
- std::vector<cl::Kernel> krnls(2*NUM_CU);
- cl::Context context;
-
-
- // std::vector<uint32_t, aligned_allocator<uint32_t> > source_hw_1hop_results[NUM_CU];
- std::vector<uint32_t, aligned_allocator<uint32_t> > source_hw_2hop_results[NUM_CU];
-
-
-
- //Q0: Modify the input,resize? **********
- for (int i = 0; i < NUM_CU; i++) {
-
- source_hw_2hop_results[i].resize(dataSize);
- }
-
- // resize offset, edges?
-
-
- // Input the graph data
- // std::generate(source_in1.begin(), source_in1.end(), std::rand);
- // std::generate(source_in2.begin(), source_in2.end(), std::rand);
- // filename_neighbor="neighbor.csv"
- // filename_offset="offset.csv"
-
- std::string filename="edge.csv";
- graph_adj_matrix(filename);
-
- graph_offset();
- //calculate the offset_index
- unsigned int node_number=offset.size(); //record the number of the nodes
-
- int node_num_remain=node_number%storeNumToge;
- unsigned int offset_index;
- if(node_num_remain==0)
- offset_index=node_number/storeNumToge;
- else
- {
- offset_index=(node_number+storeNumToge-node_num_remain)/storeNumToge; //?
- for(int k=0;k<(int)(storeNumToge-node_num_remain);k++)
- {
- EOmerges.emplace_back(-1);
- }
- printf("The remainder is %d, Add %d of -1\n",node_num_remain,storeNumToge-node_num_remain);
- }
-
- graph_edge();
- unsigned int edge_number=edges.size();//record the number of the edges
- //check offset
- if(EOmerges[offset_index*storeNumToge]!=edges[0])
- {
- printf("Edges Not Match!!EOmerges=%d, edges=%d\n",EOmerges[offset_index*storeNumToge],edges[0]);
- // std::cout<<"Edges Not Match!!EOmerges="<<EOmerges[offset_index*storeNumToge]<<",edges=",edges[0])<<"\n";
- }
- if(node_num_remain!=0)
- printf("BEFORE EDGES, Should be -1, it is %d,\n",EOmerges[offset_index*storeNumToge-1]);
- if(EOmerges[node_number-1]!=offset[node_number-1])
- printf("Offset Not Match!!EOmerges=%d, offsets=%d\n",EOmerges[node_number-1],offset[node_number-1]);
-
- unsigned int No_CU=0;//start with kernel No.0;
- std::cout<<"The node_number is :"<<node_number<<"\n";
- std::cout<<"The edge_number is :"<<edge_number<<"\n";
- if(node_number>dataSize)
- std::cout<<"The node_number is out of boudary:"<<node_number<<"\n";
- if(edge_number>dataSize)
- std::cout<<"The edge_number is out of boudary:"<<edge_number<<"\n";
- unsigned int total_num=EOmerges.size();
- std::cout<<"The total_size is :"<<total_num<<"\n";
- std::cout<<"The offset_index is :"<<offset_index<<"\n";
- // OPENCL HOST CODE AREA START
- // The get_xil_devices will return vector of Xilinx Devices
- auto devices = xcl::get_xil_devices();
-
- // read_binary_file() command will find the OpenCL binary file created using
- // the
- // V++ compiler load into OpenCL Binary and return pointer to file buffer.
- auto fileBuf = xcl::read_binary_file(binaryFile);
-
- cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}};
- bool valid_device = false;
- for (unsigned int i = 0; i < devices.size(); i++) {
- auto device = devices[i];
- // Creating Context and Command Queue for selected Device
- OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
- OCL_CHECK(err, q = cl::CommandQueue(context, device,
- CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &err));
-
- std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
- cl::Program program(context, {device}, bins, nullptr, &err);
- if (err != CL_SUCCESS) {
- std::cout << "Failed to program device[" << i << "] with xclbin file!\n";
- } else {
- std::cout << "Device[" << i << "]: program successful!\n";
- // Creating Kernel object using Compute unit names
-
- for (int i = 0; i < NUM_CU; i++) {
- std::string cu_id = std::to_string(i + 1);
-
- std::string krnl_name_full_write =krnl_name_write + ":{" + "krnl_graph_write_" + cu_id + "}";
- std::cout << "Creating a kernel [" << krnl_name_full_write.c_str() << "] for CU(" << i + 1 << ")\n";
-
-
- std::string krnl_name_full_read = krnl_name_read + ":{" + "krnl_graph_read_" + cu_id + "}";
- std::cout << "Creating a kernel [" << krnl_name_full_read.c_str() << "] for CU(" << i + 1 << ")\n";
-
-
- // Here Kernel object is created by specifying kernel name along with
- // compute unit.
- // For such case, this kernel object can only access the specific
- // Compute unit
-
- OCL_CHECK(err, krnls[2*i] = cl::Kernel(program, krnl_name_full_read.c_str(), &err));
- OCL_CHECK(err, krnls[2*i+1] = cl::Kernel(program, krnl_name_full_write.c_str(), &err));
-
- }
- valid_device = true;
- break; // we break because we found a valid device
- }
- }
- if (valid_device == false) {
- std::cout << "Failed to program any device found, exit!\n";
- exit(EXIT_FAILURE);
- }
-
- //Debug1:program device
- std::cout << "Debug1:Program device success!\n";
-
- std::vector<cl_mem_ext_ptr_t> P0(NUM_CU);
- std::vector<cl_mem_ext_ptr_t> P1(NUM_CU);
- // std::vector<cl_mem_ext_ptr_t> P2(NUM_CU);
- // std::vector<cl_mem_ext_ptr_t> P3(NUM_CU);
-
- std::vector<cl::Buffer> buffer_p0(NUM_CU);
- std::vector<cl::Buffer> buffer_p1(NUM_CU);
- // std::vector<cl::Buffer> buffer_p2(NUM_CU);
- // std::vector<cl::Buffer> buffer_p3(NUM_CU);
-
- // For Allocating Buffer to specific Global Memory PC, user has to use
- // cl_mem_ext_ptr_t
- // and provide the PC
- for (int i = 0; i < NUM_CU; i++) {
- //************************************************** Q1:Allocate obj
- P0[i].obj = EOmerges.data();
- P0[i].param = 0;
- P0[i].flags = pc[i * 2];
-
- P1[i].obj = source_hw_2hop_results[i].data();
- P1[i].param = 0;
- P1[i].flags = pc[(i * 2) + 1];
- // P1[i].flags = pc[(i * 2)];
-
-
-
-
- }
- //Debug2:allocate CL buffer
- std::cout << "Debug2:allocate CL buffer success!\n";
-
- //Q2: |CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR?
- // These commands will allocate memory on the FPGA. The cl::Buffer objects can
- // be used to reference the memory locations on the device.
- // Creating Buffers
- for (int i = 0; i < NUM_CU; i++) {
- OCL_CHECK(err,
- buffer_p0[i] = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
- sizeof(uint32_t) * total_num, &P0[i], &err));
- OCL_CHECK(err,
- buffer_p1[i] = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
- sizeof(uint32_t) * dataSize, &P1[i], &err));
-
- }
-
- //Debug3:allocate memory in FPGA
- std::cout << "Debug3:allocate memory in FPGA success!\n";
-
- //Q3: Buffer{buffer_p0[i], buffer_p1[i],,buffer_p2[i], buffer_p3[i]}
- // Copy input data to Device Global Memory
- for (int i = 0; i < NUM_CU; i++) {
- OCL_CHECK(err,
- // err = q.enqueueMigrateMemObjects({buffer_p0[i], buffer_p1[i]}, 0 /* 0 means from host*/));
- err = q.enqueueMigrateMemObjects({buffer_p0[i]}, 0 /* 0 means from host*/));
- }
- q.finish();
-
-
- //Debug4:move data from host to device
- std::cout << "Debug4:move data from host to device success!\n";
-
- double kernel_time_in_sec = 0, result = 0;
-
- std::chrono::duration<double> kernel_time(0);
-
- auto kernel_start = std::chrono::high_resolution_clock::now();
- for (int i = 0; i < NUM_CU; i++) {
- // Setting the k_vadd Arguments
- No_CU=i;
- //read
- OCL_CHECK(err, err = krnls[2*i].setArg(0, No_CU));
- OCL_CHECK(err, err = krnls[2*i].setArg(1, my_CU_num));
- OCL_CHECK(err, err = krnls[2*i].setArg(2, buffer_p0[i]));
- OCL_CHECK(err, err = krnls[2*i].setArg(3, offset_index));
- OCL_CHECK(err, err = krnls[2*i].setArg(4, my_node_number));
- OCL_CHECK(err, err = krnls[2*i].setArg(5, num_times));
-
- // write
- OCL_CHECK(err, err = krnls[2*i+1].setArg(0, No_CU));
- OCL_CHECK(err, err = krnls[2*i+1].setArg(1, my_CU_num));
- OCL_CHECK(err, err = krnls[2*i+1].setArg(2, buffer_p1[i]));
- OCL_CHECK(err, err = krnls[2*i+1].setArg(3, Hop2_offset));
- OCL_CHECK(err, err = krnls[2*i+1].setArg(4, my_node_number));
- OCL_CHECK(err, err = krnls[2*i+1].setArg(5, num_times));
-
- // Invoking the kernel
- OCL_CHECK(err, err = q.enqueueTask(krnls[2*i]));
- OCL_CHECK(err, err = q.enqueueTask(krnls[2*i+1]));
-
-
- }
- q.finish();
- //Debug5:invoke kernel
- std::cout << "Debug5:invoke kernel success!\n";
- auto kernel_end = std::chrono::high_resolution_clock::now();
-
- kernel_time = std::chrono::duration<double>(kernel_end - kernel_start);
-
- kernel_time_in_sec = kernel_time.count();
- // kernel_time_in_sec /= NUM_CU;
-
- // //Q4: Buffer{buffer_p0[i], buffer_p1[i],buffer_p2[i], buffer_p3[i]}
- // // Copy Result from Device Global Memory to Host Local Memory
- // for (int i = 0; i < NUM_CU; i++) {
- // OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_p2[i], buffer_p3[i]},
- // CL_MIGRATE_MEM_OBJECT_HOST));
- // }
- // q.finish();
-
- //Debug6:record kernel time
- std::cout << "Debug6:record kernel time success!\n";
- bool match = true;
-
- for (int i = 0; i < NUM_CU; i++) {
- match = verify(source_hw_2hop_results[i],
- dataSize, node_number,edge_number);
- if (!match) {
- std::cerr << "TEST FAILED" << std::endl;
- return EXIT_FAILURE;
- }
- }
-
- // Multiplying the actual data size by 4 because four buffers are being
- // used.
- // result = 4 * (float)dataSize * num_times * sizeof(uint32_t);
- ////batch*(2+32*(4+10*2))=batch*(2+128+640)
- result = 770 * (float)my_node_number * num_times * sizeof(uint32_t);
- result /= 1000; // to KB
- result /= 1000; // to MB
- result /= 1000; // to GB
- result /= kernel_time_in_sec; // to GBps
-
- std::cout << "Node Num = " << my_node_number << " " << std::endl;
- std::cout << "KERNEL TIME = " << kernel_time_in_sec << " s" << std::endl;
- std::cout << "OVERALL THROUGHPUT = " << result << " GB/s" << std::endl;
- std::cout << "CHANNEL THROUGHPUT = " << result / (NUM_CU * 2) << " GB/s" << std::endl;
-
- std::cout << "TEST PASSED for me" << std::endl;
- return EXIT_SUCCESS;
- }
|